diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/00-official-bug-report-issue.md b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/00-official-bug-report-issue.md deleted file mode 100644 index 51e08c26db66114de0b604bf0cc5c461311a0b4f..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/00-official-bug-report-issue.md +++ /dev/null @@ -1,59 +0,0 @@ ---- -name: "[Official Model] Bug Report" -about: Use this template for reporting a bug for the “official” directory -labels: type:bug,models:official - ---- - -# Prerequisites - -Please answer the following questions for yourself before submitting an issue. - -- [ ] I am using the latest TensorFlow Model Garden release and TensorFlow 2. -- [ ] I am reporting the issue to the correct repository. (Model Garden official or research directory) -- [ ] I checked to make sure that this issue has not been filed already. - -## 1. The entire URL of the file you are using - -https://github.com/tensorflow/models/tree/master/official/... - -## 2. Describe the bug - -A clear and concise description of what the bug is. - -## 3. Steps to reproduce - -Steps to reproduce the behavior. - -## 4. Expected behavior - -A clear and concise description of what you expected to happen. - -## 5. Additional context - -Include any logs that would be helpful to diagnose the problem. - -## 6. System information - -- OS Platform and Distribution (e.g., Linux Ubuntu 16.04): -- Mobile device name if the issue happens on a mobile device: -- TensorFlow installed from (source or binary): -- TensorFlow version (use command below): -- Python version: -- Bazel version (if compiling from source): -- GCC/Compiler version (if compiling from source): -- CUDA/cuDNN version: -- GPU model and memory: - - diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/10-official-documentation-issue.md b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/10-official-documentation-issue.md deleted file mode 100644 index 00d79a16916c327d2d8a729791db7d7d3d96b735..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/10-official-documentation-issue.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -name: "[Official Model] Documentation Issue" -about: Use this template for reporting a documentation issue for the “official” directory -labels: type:docs,models:official - ---- - -# Prerequisites - -Please answer the following question for yourself before submitting an issue. - -- [ ] I checked to make sure that this issue has not been filed already. - -## 1. The entire URL of the documentation with the issue - -https://github.com/tensorflow/models/tree/master/official/... - -## 2. Describe the issue - -A clear and concise description of what needs to be changed. 
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/20-official-feature-request-issue.md b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/20-official-feature-request-issue.md deleted file mode 100644 index 02d8cab52218202707646345a4ab2570519660dd..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/20-official-feature-request-issue.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -name: "[Official Model] Feature request" -about: Use this template for raising a feature request for the “official” directory -labels: type:feature,models:official - ---- - -# Prerequisites - -Please answer the following question for yourself before submitting an issue. - -- [ ] I checked to make sure that this feature has not been requested already. - -## 1. The entire URL of the file you are using - -https://github.com/tensorflow/models/tree/master/official/... - -## 2. Describe the feature you request - -A clear and concise description of what you want to happen. - -## 3. Additional context - -Add any other context about the feature request here. - -## 4. Are you willing to contribute it? (Yes or No) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/30-research-bug-report-issue.md b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/30-research-bug-report-issue.md deleted file mode 100644 index 4448ed9e40d6a089b84881635c2ee0f53524ae61..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/30-research-bug-report-issue.md +++ /dev/null @@ -1,58 +0,0 @@ ---- -name: "[Research Model] Bug Report" -about: Use this template for reporting a bug for the “research” directory -labels: type:bug,models:research - ---- -# Prerequisites - -Please answer the following questions for yourself before submitting an issue. - -- [ ] I am using the latest TensorFlow Model Garden release and TensorFlow 2. -- [ ] I am reporting the issue to the correct repository. (Model Garden official or research directory) -- [ ] I checked to make sure that this issue has not already been filed. - -## 1. The entire URL of the file you are using - -https://github.com/tensorflow/models/tree/master/research/... - -## 2. Describe the bug - -A clear and concise description of what the bug is. - -## 3. Steps to reproduce - -Steps to reproduce the behavior. - -## 4. Expected behavior - -A clear and concise description of what you expected to happen. - -## 5. Additional context - -Include any logs that would be helpful to diagnose the problem. - -## 6. 
System information - -- OS Platform and Distribution (e.g., Linux Ubuntu 16.04): -- Mobile device name if the issue happens on a mobile device: -- TensorFlow installed from (source or binary): -- TensorFlow version (use command below): -- Python version: -- Bazel version (if compiling from source): -- GCC/Compiler version (if compiling from source): -- CUDA/cuDNN version: -- GPU model and memory: - - diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/40-research-documentation-issue.md b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/40-research-documentation-issue.md deleted file mode 100644 index 26adfd83e1fbe27d045ecd8dfccef91bbd27fcf1..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/40-research-documentation-issue.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -name: "[Research Model] Documentation Issue" -about: Use this template for reporting a documentation issue for the “research” directory -labels: type:docs,models:research - ---- - -# Prerequisites - -Please answer the following question for yourself before submitting an issue. - -- [ ] I checked to make sure that this issue has not been filed already. - -## 1. The entire URL of the documentation with the issue - -https://github.com/tensorflow/models/tree/master/research/... - -## 2. Describe the issue - -A clear and concise description of what needs to be changed. diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/50-research-feature-request-issue.md b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/50-research-feature-request-issue.md deleted file mode 100644 index 412942a31be9cc4c2935dcd38ecb059a8a4ec18c..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/50-research-feature-request-issue.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -name: "[Research Model] Feature Request" -about: Use this template for raising a feature request for the “research” directory -labels: type:feature,models:research - ---- - -# Prerequisites - -Please answer the following question for yourself before submitting an issue. - -- [ ] I checked to make sure that this feature has not been requested already. - -## 1. The entire URL of the file you are using - -https://github.com/tensorflow/models/tree/master/research/... - -## 2. Describe the feature you request - -A clear and concise description of what you want to happen. - -## 3. Additional context - -Add any other context about the feature request here. - -## 4. Are you willing to contribute it? (Yes or No) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/60-questions-help-issue.md b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/60-questions-help-issue.md deleted file mode 100644 index bc85e0bb019fd2d5960b822c18358f906d5264b7..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/60-questions-help-issue.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -name: Questions and Help -about: Use this template for Questions and Help. 
-labels: type:support - ---- - diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/config.yml b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/config.yml deleted file mode 100644 index 3ba13e0cec6cbbfd462e9ebf529dd2093148cd69..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/ISSUE_TEMPLATE/config.yml +++ /dev/null @@ -1 +0,0 @@ -blank_issues_enabled: false diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/PULL_REQUEST_TEMPLATE.md b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/PULL_REQUEST_TEMPLATE.md deleted file mode 100644 index 379b31c57c118a174d4e787e03099288957f9fe2..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/PULL_REQUEST_TEMPLATE.md +++ /dev/null @@ -1,41 +0,0 @@ -# Description - -> :memo: Please include a summary of the change. -> -> * Please also include relevant motivation and context. -> * List any dependencies that are required for this change. - -## Type of change - -For a new feature or function, please create an issue first to discuss it -with us before submitting a pull request. - -Note: Please delete options that are not relevant. - -- [ ] Bug fix (non-breaking change which fixes an issue) -- [ ] Documentation update -- [ ] TensorFlow 2 migration -- [ ] New feature (non-breaking change which adds functionality) -- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) -- [ ] A new research paper code implementation -- [ ] Other (Specify) - -## Tests - -> :memo: Please describe the tests that you ran to verify your changes. -> -> * Provide instructions so we can reproduce. -> * Please also list any relevant details for your test configuration. - -**Test Configuration**: - -## Checklist - -- [ ] I have signed the [Contributor License Agreement](https://github.com/tensorflow/models/wiki/Contributor-License-Agreements). -- [ ] I have read [guidelines for pull request](https://github.com/tensorflow/models/wiki/Submitting-a-pull-request). -- [ ] My code follows the [coding guidelines](https://github.com/tensorflow/models/wiki/Coding-guidelines). -- [ ] I have performed a self [code review](https://github.com/tensorflow/models/wiki/Code-review) of my own code. -- [ ] I have commented my code, particularly in hard-to-understand areas. -- [ ] I have made corresponding changes to the documentation. -- [ ] My changes generate no new warnings. -- [ ] I have added tests that prove my fix is effective or that my feature works. diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/README_TEMPLATE.md b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/README_TEMPLATE.md deleted file mode 100644 index 43dba40f59684df0f79faa341c8de67916313210..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/README_TEMPLATE.md +++ /dev/null @@ -1,124 +0,0 @@ -> :memo: A README.md template for releasing a paper code implementation to a GitHub repository. -> -> * Template version: 1.0.2020.170 -> * Please modify sections depending on needs. - -# Model name, Paper title, or Project Name - -> :memo: Add a badge for the ArXiv identifier of your paper (arXiv:YYMM.NNNNN) - -[![Paper](http://img.shields.io/badge/Paper-arXiv.YYMM.NNNNN-B3181B?logo=arXiv)](https://arxiv.org/abs/...) 
- -This repository is the official or unofficial implementation of the following paper. - -* Paper title: [Paper Title](https://arxiv.org/abs/YYMM.NNNNN) - -## Description - -> :memo: Provide description of the model. -> -> * Provide brief information of the algorithms used. -> * Provide links for demos, blog posts, etc. - -## History - -> :memo: Provide a changelog. - -## Authors or Maintainers - -> :memo: Provide maintainer information. - -* Full name ([@GitHub username](https://github.com/username)) -* Full name ([@GitHub username](https://github.com/username)) - -## Table of Contents - -> :memo: Provide a table of contents to help readers navigate a lengthy README document. - -## Requirements - -[![TensorFlow 2.1](https://img.shields.io/badge/TensorFlow-2.1-FF6F00?logo=tensorflow)](https://github.com/tensorflow/tensorflow/releases/tag/v2.1.0) -[![Python 3.6](https://img.shields.io/badge/Python-3.6-3776AB)](https://www.python.org/downloads/release/python-360/) - -> :memo: Provide details of the software required. -> -> * Add a `requirements.txt` file to the root directory for installing the necessary dependencies. -> * Describe how to install requirements using pip. -> * Alternatively, create INSTALL.md. - -To install requirements: - -```setup -pip install -r requirements.txt -``` - -## Results - -[![TensorFlow Hub](https://img.shields.io/badge/TF%20Hub-Models-FF6F00?logo=tensorflow)](https://tfhub.dev/...) - -> :memo: Provide a table with results. (e.g., accuracy, latency) -> -> * Provide links to the pre-trained models (checkpoint, SavedModel files). -> * Publish TensorFlow SavedModel files on TensorFlow Hub (tfhub.dev) if possible. -> * Add links to [TensorBoard.dev](https://tensorboard.dev/) for visualizing metrics. -> -> An example table for image classification results -> -> ### Image Classification -> -> | Model name | Download | Top 1 Accuracy | Top 5 Accuracy | -> |------------|----------|----------------|----------------| -> | Model name | [Checkpoint](https://drive.google.com/...), [SavedModel](https://tfhub.dev/...) | xx% | xx% | - -## Dataset - -> :memo: Provide information of the dataset used. - -## Training - -> :memo: Provide training information. -> -> * Provide details for preprocessing, hyperparameters, random seeds, and environment. -> * Provide a command line example for training. - -Please run this command line for training. - -```shell -python3 ... -``` - -## Evaluation - -> :memo: Provide an evaluation script with details of how to reproduce results. -> -> * Describe data preprocessing / postprocessing steps. -> * Provide a command line example for evaluation. - -Please run this command line for evaluation. - -```shell -python3 ... -``` - -## References - -> :memo: Provide links to references. - -## License - -[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) - -> :memo: Place your license text in a file named LICENSE in the root of the repository. -> -> * Include information about your license. -> * Reference: [Adding a license to a repository](https://help.github.com/en/github/building-a-strong-community/adding-a-license-to-a-repository) - -This project is licensed under the terms of the **Apache License 2.0**. - -## Citation - -> :memo: Make your repository citable. -> -> * Reference: [Making Your Code Citable](https://guides.github.com/activities/citable-code/) - -If you want to cite this repository in your research paper, please use the following information. 
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/bot_config.yml b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/bot_config.yml deleted file mode 100644 index 952afc316e78d823f865ef651981fda1dde32097..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/bot_config.yml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -# -# THIS IS A GENERATED DOCKERFILE. -# -# This file was assembled from multiple pieces, whose use is documented -# throughout. Please refer to the TensorFlow dockerfiles documentation -# for more information. - -# A list of assignees -assignees: - - saikumarchalla diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/scripts/pylint.sh b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/scripts/pylint.sh deleted file mode 100644 index bb2ebebd8a87199a2138ef513cfd930af5b822bf..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/scripts/pylint.sh +++ /dev/null @@ -1,178 +0,0 @@ -#!/bin/bash -# Copyright 2016 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -# -# Pylint wrapper extracted from main TensorFlow, sharing same exceptions. -# Specify --incremental to only check files touched since last commit on master, -# otherwise will recursively check current directory (full repo takes long!). - -set -euo pipefail - -# Download latest configs from main TensorFlow repo. -wget -q -O /tmp/pylintrc https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/tools/ci_build/pylintrc - -SCRIPT_DIR=/tmp - -num_cpus() { - # Get the number of CPUs - if [[ -f /proc/cpuinfo ]]; then - N_CPUS=$(grep -c ^processor /proc/cpuinfo) - else - # Fallback method - N_CPUS=`getconf _NPROCESSORS_ONLN` - fi - if [[ -z ${N_CPUS} ]]; then - die "ERROR: Unable to determine the number of CPUs" - fi - - echo ${N_CPUS} -} - -get_changed_files_in_last_non_merge_git_commit() { - git diff --name-only $(git merge-base master $(git branch --show-current)) -} - -# List Python files changed in the last non-merge git commit that still exist, -# i.e., not removed. 
-# Usage: get_py_files_to_check [--incremental]
-get_py_files_to_check() {
-  if [[ "$1" == "--incremental" ]]; then
-    CHANGED_PY_FILES=$(get_changed_files_in_last_non_merge_git_commit | \
-                       grep '.*\.py$')
-
-    # Do not include files removed in the last non-merge commit.
-    PY_FILES=""
-    for PY_FILE in ${CHANGED_PY_FILES}; do
-      if [[ -f "${PY_FILE}" ]]; then
-        PY_FILES="${PY_FILES} ${PY_FILE}"
-      fi
-    done
-
-    echo "${PY_FILES}"
-  else
-    find . -name '*.py'
-  fi
-}
-
-do_pylint() {
-  if [[ $# == 1 ]] && [[ "$1" == "--incremental" ]]; then
-    PYTHON_SRC_FILES=$(get_py_files_to_check --incremental)
-
-    if [[ -z "${PYTHON_SRC_FILES}" ]]; then
-      echo "do_pylint will NOT run due to --incremental flag and due to the "\
-"absence of Python code changes in the last commit."
-      return 0
-    fi
-  elif [[ $# != 0 ]]; then
-    echo "Invalid syntax for invoking do_pylint"
-    echo "Usage: do_pylint [--incremental]"
-    return 1
-  else
-    PYTHON_SRC_FILES=$(get_py_files_to_check)
-  fi
-
-  # Something happened. TF no longer has Python code if this branch is taken
-  if [[ -z ${PYTHON_SRC_FILES} ]]; then
-    echo "do_pylint found no Python files to check. Returning."
-    return 0
-  fi
-
-  # Now that we know we have to do work, check if `pylint` is installed.
-  # Run the version check as the `if` condition: under `set -e`, a bare
-  # command followed by a `$?` test would abort the script before the
-  # "pylint not available" branch could ever run.
-  PYLINT_BIN="python3.8 -m pylint"
-
-  echo ""
-  echo "check whether pylint is available or not."
-  echo ""
-  if ${PYLINT_BIN} --version
-  then
-    echo ""
-    echo "pylint available, proceeding with pylint sanity check."
-    echo ""
-  else
-    echo ""
-    echo "pylint not available."
-    echo ""
-    return 1
-  fi
-
-  # Configure pylint using the following file
-  PYLINTRC_FILE="${SCRIPT_DIR}/pylintrc"
-
-  if [[ ! -f "${PYLINTRC_FILE}" ]]; then
-    die "ERROR: Cannot find pylint rc file at ${PYLINTRC_FILE}"
-  fi
-
-  # Run pylint in parallel, after some disk setup
-  NUM_SRC_FILES=$(echo ${PYTHON_SRC_FILES} | wc -w)
-  NUM_CPUS=$(num_cpus)
-
-  echo "Running pylint on ${NUM_SRC_FILES} files with ${NUM_CPUS} "\
-"parallel jobs..."
-  echo ""
-
-  PYLINT_START_TIME=$(date +'%s')
-  OUTPUT_FILE="$(mktemp)_pylint_output.log"
-  ERRORS_FILE="$(mktemp)_pylint_errors.log"
-
-  rm -rf ${OUTPUT_FILE}
-  rm -rf ${ERRORS_FILE}
-
-  set +e
-  # When running, filter to only contain the error code lines. Removes module
-  # header, removes lines of context that show up from some lines.
-  # Also, don't redirect stderr as this would hide pylint fatal errors.
- ${PYLINT_BIN} --rcfile="${PYLINTRC_FILE}" --output-format=parseable \ - --jobs=${NUM_CPUS} ${PYTHON_SRC_FILES} | grep '\[[CEFW]' > ${OUTPUT_FILE} - PYLINT_END_TIME=$(date +'%s') - - echo "" - echo "pylint took $((PYLINT_END_TIME - PYLINT_START_TIME)) s" - echo "" - - # Report only what we care about - # Ref https://pylint.readthedocs.io/en/latest/technical_reference/features.html - # E: all errors - # W0311 bad-indentation - # W0312 mixed-indentation - # C0330 bad-continuation - # C0301 line-too-long - # C0326 bad-whitespace - # W0611 unused-import - # W0622 redefined-builtin - grep -E '(\[E|\[W0311|\[W0312|\[C0330|\[C0301|\[C0326|\[W0611|\[W0622)' ${OUTPUT_FILE} > ${ERRORS_FILE} - - # Determine counts of errors - N_FORBID_ERRORS=$(wc -l ${ERRORS_FILE} | cut -d' ' -f1) - set -e - - # Now, print the errors we should fix - echo "" - if [[ ${N_FORBID_ERRORS} != 0 ]]; then - echo "Found ${N_FORBID_ERRORS} pylint errors:" - cat ${ERRORS_FILE} - fi - - echo "" - if [[ ${N_FORBID_ERRORS} != 0 ]]; then - echo "FAIL: Found ${N_FORBID_ERRORS} errors" - return 1 - else - echo "PASS: Found no errors" - fi -} - -do_pylint "$@" - diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/stale.yml b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/stale.yml deleted file mode 100644 index 7eef5309ecdf53125eb976f90c3b62f1a31a55d4..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/stale.yml +++ /dev/null @@ -1,39 +0,0 @@ - # Copyright 2019 The TensorFlow Authors. All Rights Reserved. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - # ============================================================================ - # - # THIS IS A GENERATED DOCKERFILE. - # - # This file was assembled from multiple pieces, whose use is documented - # throughout. Please refer to the TensorFlow dockerfiles documentation - # for more information. - -# Number of days of inactivity before an Issue or Pull Request becomes stale -daysUntilStale: 7 -# Number of days of inactivity before a stale Issue or Pull Request is closed -daysUntilClose: 7 -# Only issues or pull requests with all of these labels are checked if stale. Defaults to `[]` (disabled) -onlyLabels: - - stat:awaiting response -# Comment to post when marking as stale. Set to `false` to disable -markComment: > - This issue has been automatically marked as stale because it has not had - recent activity. It will be closed if no further activity occurs. Thank you. -# Comment to post when removing the stale label. Set to `false` to disable -unmarkComment: false -closeComment: > - Closing as stale. Please reopen if you'd like to work on this further. 
-limitPerRun: 30 -# Limit to only `issues` or `pulls` -only: issues diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/workflows/ci.yml b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/workflows/ci.yml deleted file mode 100644 index 744f440b053ddb5391a827b5406ddb9ad94eccef..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.github/workflows/ci.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: CI -on: pull_request - -jobs: - pylint: - runs-on: ubuntu-latest - - steps: - - name: Set up Python 3.8 - uses: actions/setup-python@v2 - with: - python-version: 3.8 - - - name: Install pylint 2.4.4 - run: | - python -m pip install --upgrade pip - pip install pylint==2.4.4 - - - name: Checkout code - uses: actions/checkout@v2 - with: - ref: ${{ github.event.pull_request.head.sha }} - fetch-depth: 0 - - - name: Fetch master for diff - run: git fetch origin master:master - - - name: Run pylint script - run: bash ./.github/scripts/pylint.sh --incremental diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.gitignore b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.gitignore deleted file mode 100644 index cbc8846d64152b8a933f4bd2727877a94f98f92a..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/.gitignore +++ /dev/null @@ -1,98 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*,cover -.hypothesis/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# IPython Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# mypy -.mypy_cache - -# celery beat schedule file -celerybeat-schedule - -# dotenv -.env - -# virtualenv -venv/ -ENV/ - -# Spyder project settings -.spyderproject - -# Rope project settings -.ropeproject - -# PyCharm -.idea/ - -# For mac -.DS_Store diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/AUTHORS b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/AUTHORS deleted file mode 100644 index 0fa85c98ffeb38c6d6d0ef2bddb790b75b90f3dc..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/AUTHORS +++ /dev/null @@ -1,10 +0,0 @@ -# This is the official list of authors for copyright purposes. -# This file is distinct from the CONTRIBUTORS files. -# See the latter for an explanation. - -# Names should be added to this file as: -# Name or Organization -# The email address is not required for organizations. - -Google Inc. 
-David Dao diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/CODEOWNERS b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/CODEOWNERS deleted file mode 100644 index 9dd84ad290b1f3d4c071a73c51eca5dd5af448dd..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/CODEOWNERS +++ /dev/null @@ -1,27 +0,0 @@ -* @tensorflow/tf-garden-team @tensorflow/tf-model-garden-team -/official/ @rachellj218 @saberkun @jaeyounkim -/official/nlp/ @saberkun @lehougoogle @rachellj218 @jaeyounkim -/official/vision/ @xianzhidu @yeqingli @arashwan @saberkun @rachellj218 @jaeyounkim -/official/vision/beta/projects/assemblenet/ @mryoo -/official/vision/beta/projects/deepmac_maskrcnn/ @vighneshbirodkar -/official/vision/beta/projects/movinet/ @hyperparticle @yuanliangzhe @yeqingli -/official/vision/beta/projects/simclr/ @luotigerlsx @chentingpc @saxenasaurabh -/research/adversarial_text/ @rsepassi @a-dai -/research/attention_ocr/ @xavigibert -/research/audioset/ @plakal @dpwe -/research/autoaugment/ @barretzoph -/research/cognitive_planning/ @s-gupta -/research/cvt_text/ @clarkkev @lmthang -/research/deep_speech/ @yhliang2018 -/research/deeplab/ @aquariusjay @yknzhu -/research/delf/ @andrefaraujo -/research/efficient-hrl/ @ofirnachum -/research/lfads/ @jazcollins @sussillo -/research/lstm_object_detection/ @yinxiaoli @yongzhe2160 -/research/marco/ @vincentvanhoucke -/research/object_detection/ @jch1 @tombstone @pkulzc -/research/pcl_rl/ @ofirnachum -/research/rebar/ @gjtucker -/research/seq_flow_lite/ @thunderfyc -/research/slim/ @sguada @marksandler2 -/research/vid2depth/ @rezama diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/CONTRIBUTING.md b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/CONTRIBUTING.md deleted file mode 100644 index f909461ae7b9c75264e0915ecb37228314933e4a..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/CONTRIBUTING.md +++ /dev/null @@ -1,10 +0,0 @@ -# How to contribute - -![Contributors](https://img.shields.io/github/contributors/tensorflow/models) - -We encourage you to contribute to the TensorFlow Model Garden. - -Please read our [guidelines](../../wiki/How-to-contribute) for details. - -**NOTE**: Only [code owners](./CODEOWNERS) are allowed to merge a pull request. -Please contact the code owners of each model to merge your pull request. diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/ISSUES.md b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/ISSUES.md deleted file mode 100644 index b23d6daa1654188d640beb67e6614bd0743f919f..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/ISSUES.md +++ /dev/null @@ -1,24 +0,0 @@ -# If you open a GitHub issue, here is our policy. - -* It must be a **bug**, a **feature request**, or a significant problem -with **documentation**. - * Please send a pull request instead for small documentation fixes. -* The required form must be filled out. -* The issue should be related to the repository it is created in. - -General help and support should be sought on [Stack Overflow](https://stackoverflow.com/questions/tagged/tensorflow-model-garden) or other non-GitHub channels. - -[![](https://img.shields.io/stackexchange/stackoverflow/t/tensorflow-model-garden)](https://stackoverflow.com/questions/tagged/tensorflow-model-garden) - -TensorFlow developers respond to issues. 
-We want to focus on work that benefits the whole community such as fixing bugs -and adding new features. -It helps us to address bugs and feature requests in a timely manner. - ---- - -Please understand that research models in the [research directory](https://github.com/tensorflow/models/tree/master/research) -included in this repository are experimental and research-style code. -They are not officially supported by the TensorFlow team. - - diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/LICENSE b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/LICENSE deleted file mode 100644 index 43fcf7bf1f1f9f824a1debf05d6ced45bf5810aa..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/LICENSE +++ /dev/null @@ -1,203 +0,0 @@ -Copyright 2016 The TensorFlow Authors. All rights reserved. - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. 
The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright 2016, The Authors.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/README.md b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/README.md
deleted file mode 100644
index 7f4a29fb9123c7edc2d627c10fa6c7b0f21f652e..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/README.md
+++ /dev/null
@@ -1,217 +0,0 @@
-- [Basic Information](#基本信息.md)
-- [Overview](#概述.md)
-- [Training Environment Preparation](#训练环境准备.md)
-- [Quick Start](#快速上手.md)
-- [Transfer Learning Guide](#迁移学习指导.md)
-- [Advanced Reference](#高级参考.md)
-
-<h2 id="基本信息.md">Basic Information</h2>
-
-**Publisher: Huawei**
-
-**Application Domain: Natural Language Processing**
-
-**Version: 1.1**
-
-**Modified: 2022.6.11**
-
-**Size: 44KB**
-
-**Framework: TensorFlow_2.6.2**
-
-**Model Format: ckpt**
-
-**Precision: Mixed**
-
-**Processor: Ascend 910**
-
-**Categories: Official**
-
-**Description: Training code for the BERT-Large natural language processing network, based on the TensorFlow framework**
-
-<h2 id="概述.md">Overview</h2>
-
-## Summary
-
-BERT is a method of pre-training language representations and the first unsupervised, deeply bidirectional system for pre-training NLP. This repository covers BERT's fine-tuning task: the pre-trained model is fine-tuned to perform prediction and question answering on the SQuAD dataset.
-- Reference paper:
-
-  [https://arxiv.org/abs/1810.04805](https://gitee.com/link?target=https%3A%2F%2Farxiv.org%2Fabs%2F1810.04805)
-
-- Reference implementation:
-
-  https://github.com/tensorflow/models/tree/r2.6.0/official/nlp/bert
-
-- Implementation adapted for the Ascend AI Processor:
-
-  https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/TensorFlow2/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X
-
-- To fetch the code at a specific commit_id via Git:
-
-  git clone {repository_url}        # clone the repository
-  cd {repository_name}              # enter the model's code directory
-  git checkout {branch}             # switch to the corresponding branch
-  git reset --hard {commit_id}      # reset the code to the corresponding commit_id
-  cd {code_path}                    # enter the model code path; skip if the repository contains only this model
-
-
-## Default Configuration
-
-- Network architecture
-  - 24-layer, 1024-hidden, 16-heads, 340M parameters
-- Training hyperparameters (single card):
-  - Batch size: 24
-  - max_predictions_per_seq: 76
-  - max_seq_length: 384
-  - Learning rate (LR): 8e-5
-  - Weight decay: 0.01
-  - Train epochs: 2
-
-
-## Supported Features
-
-| Feature              | Supported |
-| -------------------- | --------- |
-| Distributed training | No        |
-| Mixed precision      | Yes       |
-| Data parallelism     | Yes       |
-
-
-## Mixed Precision Training
-
-The Ascend 910 AI Processor provides automatic mixed precision: following a built-in optimization strategy, it automatically lowers selected float32 operators across the network to float16, improving performance and reducing memory usage with very little accuracy loss.
-
-## Enabling Mixed Precision
-
-Pass --precision_mode='allow_mix_precision' to the launch script.
-
-```
- ./train_performance_squad1.1_large_1p.sh --help
-
-parameter explain:
-    --precision_mode         precision mode(allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
-    --over_dump              if or not over detection, default is False
-    --data_dump_flag         data dump flag, default is False
-    --data_dump_step         data dump step, default is 10
-    --profiling              if or not profiling for performance debug, default is False
-    --data_path              source data of training
-    -h/--help                show help message
-```
-
-Related code example:
-
-```
-flags.DEFINE_string(name='precision_mode', default= 'allow_fp32_to_fp16',
-                    help='allow_fp32_to_fp16/force_fp16/ '
-                         'must_keep_origin_dtype/allow_mix_precision.')
-
-npu_device.global_options().precision_mode=FLAGS.precision_mode
-```
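The flag above is consumed when the NPU device is initialized. A minimal sketch of how the pieces fit together, assuming the `npu_device` API as used in the snippet above; the `modify_mixlist` option name is an assumption inferred from the `--use_mixlist`/`--mixlist_file` flags documented under Advanced Reference and from `configs/ops_info.json` in this repository, not a verified API:

```python
# Sketch: NPU initialization with mixed precision, following the flag
# plumbing shown above.
import npu_device
from absl import flags

FLAGS = flags.FLAGS

def npu_config():
    # Documented option: set the precision mode from the command-line flag.
    npu_device.global_options().precision_mode = FLAGS.precision_mode
    if FLAGS.use_mixlist and FLAGS.precision_mode == 'allow_mix_precision':
        # Assumed option name: configs/ops_info.json black-lists Cast and
        # SoftmaxV2 so they stay in float32 under mixed precision.
        npu_device.global_options().modify_mixlist = FLAGS.mixlist_file
    npu_device.open().as_default()
```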
-<h2 id="训练环境准备.md">Training Environment Preparation</h2>
-
-- For hardware environment and runtime environment setup, see the [CANN Software Installation Guide](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373?category=installation-update).
-- Run the following command to install the dependencies.
-```
-pip3 install -r requirements.txt
-```
-Note: the dependency file requirements.txt is located in the root directory of the model.
-
-<h2 id="快速上手.md">Quick Start</h2>
-
-## Dataset Preparation
-
-1. Prepare the dataset yourself; this network covers only BERT's fine-tuning task.
-
-2. The datasets used are SQuAD 1.1 and SQuAD 2.0.
-
-3. For the BERT pre-trained models and datasets, refer to "Summary -> Reference implementation".
-
-
-
-## Model Training
-
-- Click "Download Now" and choose a suitable download method to obtain the source package.
-- Start training.
-
-   1. Before launching training, first configure the environment variables required by the program.
-
-      For environment variable configuration, see:
-
-      [Ascend 910 training platform environment variable setup](https://gitee.com/ascend/ModelZoo-TensorFlow/wikis/01.%E8%AE%AD%E7%BB%83%E8%84%9A%E6%9C%AC%E8%BF%81%E7%A7%BB%E6%A1%88%E4%BE%8B/Ascend%20910%E8%AE%AD%E7%BB%83%E5%B9%B3%E5%8F%B0%E7%8E%AF%E5%A2%83%E5%8F%98%E9%87%8F%E8%AE%BE%E7%BD%AE)
-
-   2. Single-card training
-
-      2.1 Fine-tuning single-card training command (the script is located at BERT_ID2478_for_TensorFlow2.X/test/train_performance_squad1.1_large_1p.sh). First cd into the test directory, then launch training with the commands below. Make sure to change "--data_path" in the examples to your own data path; here the data folder is placed under the home directory.
-
-        - SQuAD1.1
-
-          ```
-          # BERT-Base model
-          bash train_performance_squad1.1_base_1p.sh --data_path=/home
-
-          # BERT-Large model
-          bash train_performance_squad1.1_large_1p.sh --data_path=/home
-          ```
-
-        - SQuAD2.0
-
-          ```
-          # BERT-Base model
-          bash train_performance_squad2.0_base_1p.sh --data_path=/home
-
-          # BERT-Large model
-          bash train_performance_squad2.0_large_1p.sh --data_path=/home
-          ```
-
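The test directory also ships a full-accuracy counterpart of the performance scripts (see the script list under Advanced Reference below). A hedged example invocation, assuming the same `--data_path` convention as the performance commands above:

```
cd test
# Full (accuracy) training run; the script name is taken from the script
# list under Advanced Reference, and /home follows the example data path
# used above.
bash train_full_squad1.1_large_1p.sh --data_path=/home
```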
-<h2 id="高级参考.md">Advanced Reference</h2>
-
-## Scripts and Sample Code
-
-```
-|--LICENSE
-|--README.md                                    # documentation
-|--model_training_utils.py
-|--squad_evaluate_v2_0.py
-|--squad_evaluate_v1_1.py
-|--run_squad.py                                 # training code
-|--requirements.txt                             # required dependencies
-|--utils.py
-|--test                                         # training script directory
-|    |--train_full_squad1.1_large_1p.sh         # full (accuracy) training script
-|    |--train_performance_squad1.1_large_1p.sh  # performance training script
-```
-
-## Script Parameters
-
-```
---data_path              # the path to train data
---epochs                 # epochs of training
---input_meta_data_path
---train_data_path
---predict_file
---vocab_file
---ckpt_save_path         # directory to ckpt
---batch_size             # batch size for 1p
---log_steps              # log frequency
---precision_mode         # precision mode (allow_fp32_to_fp16/force_fp16/must_keep_origin_dtype/allow_mix_precision)
---over_dump              # if or not over detection, default is False
---data_dump_flag         # data dump flag, default is False
---data_dump_step         # data dump step, default is 10
---profiling              # if or not profiling for performance debug, default is False
---profiling_dump_path    # the path to save profiling data
---over_dump_path         # the path to save over dump data
---data_dump_path         # the path to save dump data
---use_mixlist            # use_mixlist flag, default is False
---fusion_off_flag        # fusion_off flag, default is False
---mixlist_file           # mixlist file name, default is ops_info.json
---fusion_off_file        # fusion_off file name, default is fusion_switch.cfg
-```
-
-## Training Process
-
-Launch single-card or multi-card training with the commands given in "Model Training". Single-card and multi-card training run different scripts; single-card and 8-card network training are supported. The model is stored under ${cur_path}/output/$ASCEND_DEVICE_ID, including training logs and checkpoint files. Taking 8-card training as an example, loss information is written to ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log.
\ No newline at end of file
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/README_ORI.md b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/README_ORI.md
deleted file mode 100644
index 9e4a1641386f0f8d0ce0de9d6ef59e23a35475ab..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/README_ORI.md
+++ /dev/null
@@ -1,25 +0,0 @@
-![Logo](https://storage.googleapis.com/tf_model_garden/tf_model_garden_logo.png)
-
-# Welcome to the Model Garden for TensorFlow
-
-The TensorFlow Model Garden is a repository with a number of different implementations of state-of-the-art (SOTA) models and modeling solutions for TensorFlow users. We aim to demonstrate the best practices for modeling so that TensorFlow users
-can take full advantage of TensorFlow for their research and product development.
-
-| Directory | Description |
-|-----------|-------------|
-| [official](official) | • A collection of example implementations for SOTA models using the latest TensorFlow 2's high-level APIs<br />• Officially maintained, supported, and kept up to date with the latest TensorFlow 2 APIs by TensorFlow<br />• Reasonably optimized for fast performance while still being easy to read |
-| [research](research) | • A collection of research model implementations in TensorFlow 1 or 2 by researchers<br />• Maintained and supported by researchers |
-| [community](community) | • A curated list of the GitHub repositories with machine learning models and implementations powered by TensorFlow 2 |
-| [orbit](orbit) | • A flexible and lightweight library that users can easily use or fork when writing customized training loop code in TensorFlow 2.x. It seamlessly integrates with `tf.distribute` and supports running on different device types (CPU, GPU, and TPU). |
-
-## [Announcements](https://github.com/tensorflow/models/wiki/Announcements)
-
-## Contributions
-
-[![help wanted:paper implementation](https://img.shields.io/github/issues/tensorflow/models/help%20wanted%3Apaper%20implementation)](https://github.com/tensorflow/models/labels/help%20wanted%3Apaper%20implementation)
-
-If you want to contribute, please review the [contribution guidelines](https://github.com/tensorflow/models/wiki/How-to-contribute).
-
-## License
-
-[Apache License 2.0](LICENSE)
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/community/README.md b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/community/README.md
deleted file mode 100644
index b3f2bac74897d440d7e16efadcae45b8b5e46249..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/community/README.md
+++ /dev/null
@@ -1,60 +0,0 @@
-![Logo](https://storage.googleapis.com/tf_model_garden/tf_model_garden_logo.png)
-
-# TensorFlow Community Models
-
-This repository provides a curated list of the GitHub repositories with machine learning models and implementations powered by TensorFlow 2.
-
-**Note**: Contributing companies or individuals are responsible for maintaining their repositories.
-
-## Computer Vision
-
-### Image Recognition
-
-| Model | Paper | Features | Maintainer |
-|-------|-------|----------|------------|
-| [DenseNet 169](https://github.com/IntelAI/models/tree/master/benchmarks/image_recognition/tensorflow/densenet169) | [Densely Connected Convolutional Networks](https://arxiv.org/pdf/1608.06993) | • FP32 Inference | [Intel](https://github.com/IntelAI) |
-| [Inception V3](https://github.com/IntelAI/models/tree/master/benchmarks/image_recognition/tensorflow/inceptionv3) | [Rethinking the Inception Architecture<br />for Computer Vision](https://arxiv.org/pdf/1512.00567.pdf) | • Int8 Inference<br />• FP32 Inference | [Intel](https://github.com/IntelAI) |
for Computer Vision](https://arxiv.org/pdf/1512.00567.pdf) | • Int8 Inference
• FP32 Inference | [Intel](https://github.com/IntelAI) | -| [Inception V4](https://github.com/IntelAI/models/tree/master/benchmarks/image_recognition/tensorflow/inceptionv4) | [Inception-v4, Inception-ResNet and the Impact
of Residual Connections on Learning](https://arxiv.org/pdf/1602.07261) | • Int8 Inference
• FP32 Inference | [Intel](https://github.com/IntelAI) | -| [MobileNet V1](https://github.com/IntelAI/models/tree/master/benchmarks/image_recognition/tensorflow/mobilenet_v1) | [MobileNets: Efficient Convolutional Neural Networks
for Mobile Vision Applications](https://arxiv.org/pdf/1704.04861) | • Int8 Inference
• FP32 Inference | [Intel](https://github.com/IntelAI) | -| [ResNet 101](https://github.com/IntelAI/models/tree/master/benchmarks/image_recognition/tensorflow/resnet101) | [Deep Residual Learning for Image Recognition](https://arxiv.org/pdf/1512.03385) | • Int8 Inference
• FP32 Inference | [Intel](https://github.com/IntelAI) | -| [ResNet 50](https://github.com/IntelAI/models/tree/master/benchmarks/image_recognition/tensorflow/resnet50) | [Deep Residual Learning for Image Recognition](https://arxiv.org/pdf/1512.03385) | • Int8 Inference
• FP32 Inference | [Intel](https://github.com/IntelAI) | -| [ResNet 50v1.5](https://github.com/IntelAI/models/tree/master/benchmarks/image_recognition/tensorflow/resnet50v1_5) | [Deep Residual Learning for Image Recognition](https://arxiv.org/pdf/1512.03385) | • Int8 Inference
• FP32 Inference
• FP32 Training | [Intel](https://github.com/IntelAI) | -| [EfficientNet](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/Classification/ConvNets/efficientnet) | [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/pdf/1905.11946.pdf) | • Automatic mixed precision
• Horovod Multi-GPU training (NCCL)
• Multi-node training on a Pyxis/Enroot Slurm cluster
• XLA | [NVIDIA](https://github.com/NVIDIA) | - -### Object Detection - -| Model | Paper | Features | Maintainer | -|-------|-------|----------|------------| -| [R-FCN](https://github.com/IntelAI/models/tree/master/benchmarks/object_detection/tensorflow/rfcn) | [R-FCN: Object Detection
via Region-based Fully Convolutional Networks](https://arxiv.org/pdf/1605.06409) | • Int8 Inference
• FP32 Inference | [Intel](https://github.com/IntelAI) | -| [SSD-MobileNet](https://github.com/IntelAI/models/tree/master/benchmarks/object_detection/tensorflow/ssd-mobilenet) | [MobileNets: Efficient Convolutional Neural Networks
for Mobile Vision Applications](https://arxiv.org/pdf/1704.04861) | • Int8 Inference
• FP32 Inference | [Intel](https://github.com/IntelAI) | -| [SSD-ResNet34](https://github.com/IntelAI/models/tree/master/benchmarks/object_detection/tensorflow/ssd-resnet34) | [SSD: Single Shot MultiBox Detector](https://arxiv.org/pdf/1512.02325) | • Int8 Inference
• FP32 Inference
• FP32 Training | [Intel](https://github.com/IntelAI) | - -### Segmentation - -| Model | Paper | Features | Maintainer | -|-------|-------|----------|------------| -| [Mask R-CNN](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/Segmentation/MaskRCNN) | [Mask R-CNN](https://arxiv.org/abs/1703.06870) | • Automatic Mixed Precision
• Multi-GPU training support with Horovod
• TensorRT | [NVIDIA](https://github.com/NVIDIA) | -| [U-Net Medical Image Segmentation](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/Segmentation/UNet_Medical) | [U-Net: Convolutional Networks for Biomedical Image Segmentation](https://arxiv.org/abs/1505.04597) | • Automatic Mixed Precision
• Multi-GPU training support with Horovod
• TensorRT | [NVIDIA](https://github.com/NVIDIA) | - -## Natural Language Processing - -| Model | Paper | Features | Maintainer | -|-------|-------|----------|------------| -| [BERT](https://github.com/IntelAI/models/tree/master/benchmarks/language_modeling/tensorflow/bert_large) | [BERT: Pre-training of Deep Bidirectional Transformers
for Language Understanding](https://arxiv.org/pdf/1810.04805) | • FP32 Inference
• FP32 Training | [Intel](https://github.com/IntelAI) | -| [BERT](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/LanguageModeling/BERT) | [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/pdf/1810.04805) | • Horovod Multi-GPU
• Multi-node with Horovod and Pyxis/Enroot Slurm cluster
• XLA
• Automatic mixed precision
• LAMB | [NVIDIA](https://github.com/NVIDIA) | -| [ELECTRA](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/LanguageModeling/ELECTRA) | [ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators](https://openreview.net/forum?id=r1xMH1BtvB) | • Automatic Mixed Precision
• Multi-GPU training support with Horovod
• Multi-node training on a Pyxis/Enroot Slurm cluster | [NVIDIA](https://github.com/NVIDIA) | -| [GNMT](https://github.com/IntelAI/models/tree/master/benchmarks/language_translation/tensorflow/mlperf_gnmt) | [Google’s Neural Machine Translation System:
Bridging the Gap between Human and Machine Translation](https://arxiv.org/pdf/1609.08144) | • FP32 Inference | [Intel](https://github.com/IntelAI) | -| [Transformer-LT (Official)](https://github.com/IntelAI/models/tree/master/benchmarks/language_translation/tensorflow/transformer_lt_official) | [Attention Is All You Need](https://arxiv.org/pdf/1706.03762) | • FP32 Inference | [Intel](https://github.com/IntelAI) | -| [Transformer-LT (MLPerf)](https://github.com/IntelAI/models/tree/master/benchmarks/language_translation/tensorflow/transformer_mlperf) | [Attention Is All You Need](https://arxiv.org/pdf/1706.03762) | • FP32 Training | [Intel](https://github.com/IntelAI) | - -## Recommendation Systems - -| Model | Paper | Features | Maintainer | -|-------|-------|----------|------------| -| [Wide & Deep](https://github.com/IntelAI/models/tree/master/benchmarks/recommendation/tensorflow/wide_deep_large_ds) | [Wide & Deep Learning for Recommender Systems](https://arxiv.org/pdf/1606.07792) | • FP32 Inference
• FP32 Training | [Intel](https://github.com/IntelAI) | -| [Wide & Deep](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/Recommendation/WideAndDeep) | [Wide & Deep Learning for Recommender Systems](https://arxiv.org/pdf/1606.07792) | • Automatic mixed precision
• Multi-GPU training support with Horovod
• XLA | [NVIDIA](https://github.com/NVIDIA) | -| [DLRM](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/Recommendation/DLRM) | [Deep Learning Recommendation Model for Personalization and Recommendation Systems](https://arxiv.org/pdf/1906.00091.pdf) | • Automatic Mixed Precision
• Hybrid-parallel multi-GPU training using Horovod all-to-all
• Multi-node training on Pyxis/Enroot Slurm clusters
• XLA
• Criteo dataset preprocessing with Spark on GPU | [NVIDIA](https://github.com/NVIDIA) | - -## Contributions - -If you want to contribute, please review the [contribution guidelines](https://github.com/tensorflow/models/wiki/How-to-contribute). diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/configs/ops_info.json b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/configs/ops_info.json deleted file mode 100644 index 5dc3564522f36d625db1cc85f8cb1fd72967bbe6..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/configs/ops_info.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "black-list": { - "to-add": [ - "Cast", - "SoftmaxV2" - ] - } -} diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/modelzoo_level.txt b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/modelzoo_level.txt deleted file mode 100644 index 31529da2e68f25b61e2a3e698a07537281443c03..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/modelzoo_level.txt +++ /dev/null @@ -1,3 +0,0 @@ -FuncStatus:OK -PerfStatus:OK -PrecisionStatus:OK \ No newline at end of file diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/LICENSE b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/LICENSE deleted file mode 100644 index d3da228420e973edaf4123d5eeb42210f4450b0c..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/LICENSE +++ /dev/null @@ -1,203 +0,0 @@ -Copyright 2015 The TensorFlow Authors. All rights reserved. - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). 
- - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright 2015, The TensorFlow Authors. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/README-TPU.md b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/README-TPU.md deleted file mode 100644 index 28a5a0a73d210e9fe6e00db38d0e911e3d771ddf..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/README-TPU.md +++ /dev/null @@ -1,29 +0,0 @@ -# Officially Supported TensorFlow 2.1+ Models on Cloud TPU - -## Natural Language Processing - -* [bert](nlp/bert): A powerful pre-trained language representation model: - BERT, which stands for Bidirectional Encoder Representations from - Transformers. - [BERT FineTuning with Cloud TPU](https://cloud.google.com/tpu/docs/tutorials/bert-2.x) provides step-by-step instructions on Cloud TPU training. See the [BERT MNLI Tensorboard.dev metrics](https://tensorboard.dev/experiment/LijZ1IrERxKALQfr76gndA) for the MNLI fine-tuning task.
-* [transformer](nlp/transformer): A transformer model to translate the WMT - English to German dataset. - See [Training transformer on Cloud TPU](https://cloud.google.com/tpu/docs/tutorials/transformer-2.x) for step-by-step instructions on Cloud TPU training. - -## Computer Vision - -* [efficientnet](vision/image_classification): A family of convolutional - neural networks that scale by balancing network depth, width, and - resolution and can be used to classify ImageNet's dataset of 1000 classes. - See [Tensorboard.dev training metrics](https://tensorboard.dev/experiment/KnaWjrq5TXGfv0NW5m7rpg/#scalars). -* [mnist](vision/image_classification): A basic model to classify digits - from the MNIST dataset. See the [Running MNIST on Cloud TPU](https://cloud.google.com/tpu/docs/tutorials/mnist-2.x) tutorial and [Tensorboard.dev metrics](https://tensorboard.dev/experiment/mIah5lppTASvrHqWrdr6NA). -* [mask-rcnn](vision/detection): An object detection and instance segmentation model. See [Tensorboard.dev training metrics](https://tensorboard.dev/experiment/LH7k0fMsRwqUAcE09o9kPA). -* [resnet](vision/image_classification): A deep residual network that can - be used to classify ImageNet's dataset of 1000 classes. - See the [Training ResNet on Cloud TPU](https://cloud.google.com/tpu/docs/tutorials/resnet-2.x) tutorial and [Tensorboard.dev metrics](https://tensorboard.dev/experiment/CxlDK8YMRrSpYEGtBRpOhg). -* [retinanet](vision/detection): A fast and powerful object detector. See [Tensorboard.dev training metrics](https://tensorboard.dev/experiment/b8NRnWU3TqG6Rw0UxueU6Q). -* [shapemask](vision/detection): An object detection and instance segmentation model using shape priors. See [Tensorboard.dev training metrics](https://tensorboard.dev/experiment/ZbXgVoc6Rf6mBRlPj0JpLA). - -## Recommendation -* [ncf](recommendation): Neural Collaborative Filtering. See [Tensorboard.dev training metrics](https://tensorboard.dev/experiment/0k3gKjZlR1ewkVTRyLB6IQ). diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/README.md b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/README.md deleted file mode 100644 index c53decf083e302896fc4a7a92525cb2128ef6352..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/README.md +++ /dev/null @@ -1,177 +0,0 @@ -![Logo](https://storage.googleapis.com/model_garden_artifacts/TF_Model_Garden.png) - -# TensorFlow Official Models - -The TensorFlow official models are a collection of models -that use TensorFlow’s high-level APIs. -They are intended to be well-maintained, tested, and kept up to date -with the latest TensorFlow API. - -They should also be reasonably optimized for fast performance while still -being easy to read. -These models are used as end-to-end tests, ensuring that the models run -with the same or improved speed and performance with each new TensorFlow build. - -## More models to come! - -The team is actively developing new models. -In the near future, we will add: - -* State-of-the-art language understanding models. -* State-of-the-art image classification models. -* State-of-the-art object detection and instance segmentation models.
- -## Table of Contents - -- [Models and Implementations](#models-and-implementations) - * [Computer Vision](#computer-vision) - + [Image Classification](#image-classification) - + [Object Detection and Segmentation](#object-detection-and-segmentation) - * [Natural Language Processing](#natural-language-processing) - * [Recommendation](#recommendation) -- [How to get started with the official models](#how-to-get-started-with-the-official-models) - -## Models and Implementations - -### Computer Vision - -#### Image Classification - -| Model | Reference (Paper) | -|-------|-------------------| -| [MNIST](vision/image_classification) | A basic model to classify digits from the [MNIST dataset](http://yann.lecun.com/exdb/mnist/) | -| [ResNet](vision/image_classification) | [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) | -| [ResNet-RS](vision/beta/MODEL_GARDEN.md) | [Revisiting ResNets: Improved Training and Scaling Strategies](https://arxiv.org/abs/2103.07579) | -| [EfficientNet](vision/image_classification) | [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) | - -#### Object Detection and Segmentation - -| Model | Reference (Paper) | -|-------|-------------------| -| [RetinaNet](vision/detection) | [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002) | -| [Mask R-CNN](vision/detection) | [Mask R-CNN](https://arxiv.org/abs/1703.06870) | -| [ShapeMask](vision/detection) | [ShapeMask: Learning to Segment Novel Objects by Refining Shape Priors](https://arxiv.org/abs/1904.03239) | -| [SpineNet](vision/detection) | [SpineNet: Learning Scale-Permuted Backbone for Recognition and Localization](https://arxiv.org/abs/1912.05027) | - -### Natural Language Processing - -| Model | Reference (Paper) | -|-------|-------------------| -| [ALBERT (A Lite BERT)](nlp/albert) | [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942) | -| [BERT (Bidirectional Encoder Representations from Transformers)](nlp/bert) | [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) | -| [NHNet (News Headline generation model)](nlp/nhnet) | [Generating Representative Headlines for News Stories](https://arxiv.org/abs/2001.09386) | -| [Transformer](nlp/transformer) | [Attention Is All You Need](https://arxiv.org/abs/1706.03762) | -| [XLNet](nlp/xlnet) | [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) | -| [MobileBERT](nlp/projects/mobilebert) | [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) | - -### Recommendation - -| Model | Reference (Paper) | -|-------|-------------------| -| [NCF](recommendation) | [Neural Collaborative Filtering](https://arxiv.org/abs/1708.05031) | - -## How to get started with the official models - -* The models in the master branch are developed using TensorFlow 2, -and they target the TensorFlow [nightly binaries](https://github.com/tensorflow/tensorflow#installation) -built from the -[master branch of TensorFlow](https://github.com/tensorflow/tensorflow/tree/master). -* The stable versions targeting releases of TensorFlow are available -as tagged branches or [downloadable releases](https://github.com/tensorflow/models/releases). 
-* Model repository version numbers match the target TensorFlow release, -such that -[release v2.2.0](https://github.com/tensorflow/models/releases/tag/v2.2.0) -is compatible with -[TensorFlow v2.2.0](https://github.com/tensorflow/tensorflow/releases/tag/v2.2.0). - -Please follow the steps below before running models in this repository. - -### Requirements - -* The latest TensorFlow Model Garden release and TensorFlow 2 - * If you are on a version of TensorFlow earlier than 2.2, please -upgrade your TensorFlow to [the latest TensorFlow 2](https://www.tensorflow.org/install/). - -```shell -pip3 install tf-nightly -``` - -### Installation - -#### Method 1: Install the TensorFlow Model Garden pip package - -**tf-models-official** is the stable Model Garden package. -pip will install all models and dependencies automatically. - -```shell -pip install tf-models-official -``` - -If you are using nlp packages, please also install **tensorflow-text**: - -```shell -pip install tensorflow-text -``` - -Please check out our [example](colab/fine_tuning_bert.ipynb) -to learn how to use a PIP package. - -Note that **tf-models-official** may not include the latest changes in this -GitHub repo. To include the latest changes, you may install **tf-models-nightly**, -which is the nightly Model Garden package created daily automatically. - -```shell -pip install tf-models-nightly -``` - -#### Method 2: Clone the source - -1. Clone the GitHub repository: - -```shell -git clone https://github.com/tensorflow/models.git -``` - -2. Add the top-level ***/models*** folder to the Python path. - -```shell -export PYTHONPATH=$PYTHONPATH:/path/to/models -``` - -If you are using a Colab notebook, please set the Python path with os.environ. - -```python -import os -os.environ['PYTHONPATH'] += ":/path/to/models" -``` - -3. Install other dependencies - -```shell -pip3 install --user -r official/requirements.txt -``` - -Finally, if you are using nlp packages, please also install -**tensorflow-text-nightly**: - -```shell -pip3 install tensorflow-text-nightly -``` - -## Contributions - -If you want to contribute, please review the [contribution guidelines](https://github.com/tensorflow/models/wiki/How-to-contribute). - -## Citing TF Official Model Garden - -To cite this repository: - -``` -@software{tfmodels2020github, - author = {Chen Chen and Xianzhi Du and Le Hou and Jaeyoun Kim and Jing Li and - Yeqing Li and Abdullah Rashwan and Fan Yang and Hongkun Yu}, - title = {TensorFlow Official Model Garden}, - url = {https://github.com/tensorflow/models/tree/master/official}, - year = {2020}, -} -``` diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/__init__.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/__init__.py deleted file mode 100644 index 9772d6bd74cf0348a137ea4bce7fe8bd29ac9ca1..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -# -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/colab/decoding_api_in_tf_nlp.ipynb b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/colab/decoding_api_in_tf_nlp.ipynb deleted file mode 100644 index 726b382e228265fa1e19c2af3150e7cc32a0ec56..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/colab/decoding_api_in_tf_nlp.ipynb +++ /dev/null @@ -1,492 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "vXLA5InzXydn" - }, - "source": [ - "##### Copyright 2021 The TensorFlow Authors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "RuRlpLL-X0R_" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fsACVQpVSifi" - }, - "source": [ - "### Install the TensorFlow Model Garden pip package\n", - "\n", - "* `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` GitHub repo. To include the latest changes, you may install `tf-models-nightly`,\n", - "which is the nightly Model Garden package created daily automatically.\n", - "* pip will install all models and dependencies automatically."
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hYEwGTeCXnnX" - }, - "source": [ - "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/official_models/tutorials/decoding_api_in_tf_nlp.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/decoding_api_in_tf_nlp.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/decoding_api_in_tf_nlp.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/decoding_api_in_tf_nlp.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n", - " \u003c/td\u003e\n", - "\u003c/table\u003e" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2j-xhrsVQOQT" - }, - "outputs": [], - "source": [ - "pip install tf-models-nightly" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BjP7zwxmskpY" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "\n", - "import tensorflow as tf\n", - "\n", - "from official import nlp\n", - "from official.nlp.modeling.ops import sampling_module\n", - "from official.nlp.modeling.ops import beam_search" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0AWgyo-IQ5sP" - }, - "source": [ - "# Decoding API\n", - "This API provides an interface to experiment with different decoding strategies used for auto-regressive models.\n", - "\n", - "1. The following sampling strategies are provided in sampling_module.py, which inherits from the base Decoding class:\n", - " * [top_p](https://arxiv.org/abs/1904.09751) : [github](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/sampling_module.py#L65) \n", - "\n", - " This implementation chooses the most probable logits with cumulative probability up to top_p.\n", - "\n", - " * [top_k](https://arxiv.org/pdf/1805.04833.pdf) : [github](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/sampling_module.py#L48)\n", - "\n", - " At each timestep, this implementation samples from the top-k logits based on their probability distribution.\n", - "\n", - " * Greedy : [github](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/sampling_module.py#L26)\n", - "\n", - " This implementation returns the top logits based on probabilities.\n", - "\n", - "2. Beam search is provided in beam_search.py. [github](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/beam_search.py)\n", - "\n", - " This implementation reduces the risk of missing hidden high-probability logits by keeping the most likely num_beams of logits at each time step and eventually choosing the logits that have the overall highest probability."
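Before these strategies are wired into `SamplingModule` below, a minimal self-contained sketch may help build intuition for what each one does to a next-token distribution. This is plain TensorFlow on toy logits, not the `sampling_module` implementation; the helper names `apply_temperature`, `top_k_filter`, and `top_p_filter` are invented for illustration.

```python
import tensorflow as tf

logits = tf.constant([2.0, 1.0, 0.5, -1.0])  # toy next-token logits over a 4-token vocab

def apply_temperature(logits, temperature):
    # temperature < 1 sharpens the distribution; temperature > 1 flattens it.
    return logits / temperature

def top_k_filter(logits, k):
    # Keep the k largest logits; set the rest to -inf so softmax gives them zero mass.
    kth_largest = tf.math.top_k(logits, k=k).values[-1]
    neg_inf = tf.fill(tf.shape(logits), float('-inf'))
    return tf.where(logits < kth_largest, neg_inf, logits)

def top_p_filter(logits, p):
    # Keep the smallest set of tokens whose cumulative probability exceeds p.
    order = tf.argsort(logits, direction='DESCENDING')
    sorted_logits = tf.gather(logits, order)
    probs = tf.nn.softmax(sorted_logits)
    keep = tf.cumsum(probs) - probs < p  # cumulative mass *before* each token
    neg_inf = tf.fill(tf.shape(logits), float('-inf'))
    filtered = tf.where(keep, sorted_logits, neg_inf)
    return tf.gather(filtered, tf.argsort(order))  # undo the descending sort

print(tf.nn.softmax(apply_temperature(logits, 0.5)).numpy())  # sharper than softmax(logits)
print(tf.nn.softmax(top_k_filter(logits, k=2)).numpy())       # mass only on the top 2 ids
print(tf.nn.softmax(top_p_filter(logits, p=0.8)).numpy())     # smallest set covering 80%
```

A sampler would then draw the next id from the filtered logits (for example with `tf.random.categorical` on a batched version), which is the role `SamplingModule` plays in the cells that follow.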
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MfOj7oaBRQnS" - }, - "source": [ - "## Initialize Sampling Module in TF-NLP.\n", - "\n", - "\n", - "\u003e **symbols_to_logits_fn** : This is a closure implemented by the users of the API. The input to this closure will be \n", - "```\n", - "Args:\n", - " 1] ids [batch_size, .. (index + 1 or 1 if padded_decode is True)],\n", - " 2] index [scalar] : current decoded step,\n", - " 3] cache [nested dictionary of tensors].\n", - "Returns:\n", - " 1] tensor for next-step logits [batch_size, vocab]\n", - " 2] the updated_cache [nested dictionary of tensors].\n", - "```\n", - "This closure calls the model to predict the logits for the 'index+1' step. The cache is used for faster decoding.\n", - "Here is a [reference](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/ops/beam_search_test.py#L88) implementation for the above closure.\n", - "\n", - "\n", - "\u003e **length_normalization_fn** : Closure for returning length normalization parameter.\n", - "```\n", - "Args: \n", - " 1] length : scalar for decoded step index.\n", - " 2] dtype : data-type of output tensor\n", - "Returns:\n", - " 1] value of length normalization factor.\n", - "Example :\n", - " def _length_norm(length, dtype):\n", - " return tf.pow(((5. + tf.cast(length, dtype)) / 6.), 0.0)\n", - "```\n", - "\n", - "\u003e **vocab_size** : Output vocabulary size.\n", - "\n", - "\u003e **max_decode_length** : Scalar for total number of decoding steps.\n", - "\n", - "\u003e **eos_id** : Decoding will stop if all output decoded ids in the batch have this ID.\n", - "\n", - "\u003e **padded_decode** : Set this to True if running on TPU. Tensors are padded to max_decoding_length if this is True.\n", - "\n", - "\u003e **top_k** : top_k is enabled if this value is \u003e 1.\n", - "\n", - "\u003e **top_p** : top_p is enabled if this value is \u003e 0 and \u003c 1.0\n", - "\n", - "\u003e **sampling_temperature** : This is used to re-estimate the softmax output. Temperature skews the distribution towards high probability tokens and lowers the mass in tail distribution. Value has to be positive. 
Low temperature is equivalent to greedy and makes the distribution sharper, while high temperature makes it more flat.\n", - "\n", - "\u003e **enable_greedy** : By default, this is true and greedy decoding is enabled.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lV1RRp6ihnGX" - }, - "source": [ - "# Initialize the Model Hyper-parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eTsGp2gaKLdE" - }, - "outputs": [], - "source": [ - "params = {}\n", - "params['num_heads'] = 2\n", - "params['num_layers'] = 2\n", - "params['batch_size'] = 2\n", - "params['n_dims'] = 256\n", - "params['max_decode_length'] = 4" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UGvmd0_dRFYI" - }, - "source": [ - "## What is a Cache?\n", - "In auto-regressive architectures like Transformer based [Encoder-Decoder](https://arxiv.org/abs/1706.03762) models, \n", - "Cache is used for fast sequential decoding.\n", - "It is a nested dictionary storing pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention blocks) for every layer.\n", - "\n", - "```\n", - "{\n", - " 'layer_%d' % layer: {\n", - " 'k': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']/params['num_heads']], dtype=tf.float32),\n", - " 'v': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']/params['num_heads']], dtype=tf.float32)\n", - " } for layer in range(params['num_layers']),\n", - " 'model_specific_item' : Model specific tensor shape,\n", - "}\n", - "\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CYXkoplAij01" - }, - "source": [ - "# Initialize cache. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "D6kfZOOKgkm1" - }, - "outputs": [], - "source": [ - "cache = {\n", - " 'layer_%d' % layer: {\n", - " 'k': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']/params['num_heads']], dtype=tf.float32),\n", - " 'v': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']/params['num_heads']], dtype=tf.float32)\n", - " } for layer in range(params['num_layers'])\n", - " }\n", - "print(\"cache key shape for layer 1 :\", cache['layer_1']['k'].shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nNY3Xn8SiblP" - }, - "source": [ - "# Define closure for length normalization. **optional.**\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "T92ccAzlnGqh" - }, - "outputs": [], - "source": [ - "def length_norm(length, dtype):\n", - " \"\"\"Return length normalization factor.\"\"\"\n", - " return tf.pow(((5. 
+ tf.cast(length, dtype)) / 6.), 0.0)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "syl7I5nURPgW" - }, - "source": [ - "# Create model_fn\n", - " In practice, this will be replaced by an actual model implementation such as [here](https://github.com/tensorflow/models/blob/master/official/nlp/transformer/transformer.py#L236)\n", - "```\n", - "Args:\n", - "i : Step that is being decoded.\n", - "Returns:\n", - " logit probabilities of size [batch_size, 1, vocab_size]\n", - "```\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "AhzSkRisRdB6" - }, - "outputs": [], - "source": [ - "probabilities = tf.constant([[[0.3, 0.4, 0.3], [0.3, 0.3, 0.4],\n", - " [0.1, 0.1, 0.8], [0.1, 0.1, 0.8]],\n", - " [[0.2, 0.5, 0.3], [0.2, 0.7, 0.1],\n", - " [0.1, 0.1, 0.8], [0.1, 0.1, 0.8]]])\n", - "def model_fn(i):\n", - " return probabilities[:, i, :]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DBMUkaVmVZBg" - }, - "source": [ - "# Initialize symbols_to_logits_fn\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FAJ4CpbfVdjr" - }, - "outputs": [], - "source": [ - "def _symbols_to_logits_fn():\n", - " \"\"\"Calculates logits of the next tokens.\"\"\"\n", - " def symbols_to_logits_fn(ids, i, temp_cache):\n", - " del ids\n", - " logits = tf.cast(tf.math.log(model_fn(i)), tf.float32)\n", - " return logits, temp_cache\n", - " return symbols_to_logits_fn" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R_tV3jyWVL47" - }, - "source": [ - "# Greedy \n", - "Greedy decoding selects the token id with the highest probability as its next id: $id_t = argmax_{w}P(id | id_{1:t-1})$ at each timestep $t$. The following sketch shows greedy decoding. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "aGt9idSkVQEJ" - }, - "outputs": [], - "source": [ - "greedy_obj = sampling_module.SamplingModule(\n", - " length_normalization_fn=None,\n", - " dtype=tf.float32,\n", - " symbols_to_logits_fn=_symbols_to_logits_fn(),\n", - " vocab_size=3,\n", - " max_decode_length=params['max_decode_length'],\n", - " eos_id=10,\n", - " padded_decode=False)\n", - "ids, _ = greedy_obj.generate(\n", - " initial_ids=tf.constant([9, 1]), initial_cache=cache)\n", - "print(\"Greedy Decoded Ids:\", ids)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "s4pTTsQXVz5O" - }, - "source": [ - "# top_k sampling\n", - "In *Top-K* sampling, the *K* most likely next token ids are filtered and the probability mass is redistributed among only those *K* ids. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pCLWIn6GV5_G" - }, - "outputs": [], - "source": [ - "top_k_obj = sampling_module.SamplingModule(\n", - " length_normalization_fn=length_norm,\n", - " dtype=tf.float32,\n", - " symbols_to_logits_fn=_symbols_to_logits_fn(),\n", - " vocab_size=3,\n", - " max_decode_length=params['max_decode_length'],\n", - " eos_id=10,\n", - " sample_temperature=tf.constant(1.0),\n", - " top_k=tf.constant(3),\n", - " padded_decode=False,\n", - " enable_greedy=False)\n", - "ids, _ = top_k_obj.generate(\n", - " initial_ids=tf.constant([9, 1]), initial_cache=cache)\n", - "print(\"top-k sampled Ids:\", ids)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Jp3G-eE_WI4Y" - }, - "source": [ - "# top_p sampling\n", - "Instead of sampling only from the most likely *K* token ids, in *Top-p* sampling chooses from the smallest possible set of ids whose cumulative probability exceeds the probability *p*." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rEGdIWcuWILO" - }, - "outputs": [], - "source": [ - "top_p_obj = sampling_module.SamplingModule(\n", - " length_normalization_fn=length_norm,\n", - " dtype=tf.float32,\n", - " symbols_to_logits_fn=_symbols_to_logits_fn(),\n", - " vocab_size=3,\n", - " max_decode_length=params['max_decode_length'],\n", - " eos_id=10,\n", - " sample_temperature=tf.constant(1.0),\n", - " top_p=tf.constant(0.9),\n", - " padded_decode=False,\n", - " enable_greedy=False)\n", - "ids, _ = top_p_obj.generate(\n", - " initial_ids=tf.constant([9, 1]), initial_cache=cache)\n", - "print(\"top-p sampled Ids:\", ids)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2hcuyJ2VWjDz" - }, - "source": [ - "# Beam search decoding\n", - "Beam search reduces the risk of missing hidden high probability token ids by keeping the most likely num_beams of hypotheses at each time step and eventually choosing the hypothesis that has the overall highest probability. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cJ3WzvSrWmSA" - }, - "outputs": [], - "source": [ - "beam_size = 2\n", - "params['batch_size'] = 1\n", - "beam_cache = {\n", - " 'layer_%d' % layer: {\n", - " 'k': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']], dtype=tf.float32),\n", - " 'v': tf.zeros([params['batch_size'], params['max_decode_length'], params['num_heads'], params['n_dims']], dtype=tf.float32)\n", - " } for layer in range(params['num_layers'])\n", - " }\n", - "print(\"cache key shape for layer 1 :\", beam_cache['layer_1']['k'].shape)\n", - "ids, _ = beam_search.sequence_beam_search(\n", - " symbols_to_logits_fn=_symbols_to_logits_fn(),\n", - " initial_ids=tf.constant([9], tf.int32),\n", - " initial_cache=beam_cache,\n", - " vocab_size=3,\n", - " beam_size=beam_size,\n", - " alpha=0.6,\n", - " max_decode_length=params['max_decode_length'],\n", - " eos_id=10,\n", - " padded_decode=False,\n", - " dtype=tf.float32)\n", - "print(\"Beam search ids:\", ids)" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "decoding_api_in_tf_nlp.ipynb", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/colab/fine_tuning_bert.ipynb b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/colab/fine_tuning_bert.ipynb deleted file mode 100644 index ad34d68d66770273a055cbaf345c52df734bfa79..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/colab/fine_tuning_bert.ipynb +++ /dev/null @@ -1,1678 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "vXLA5InzXydn" - }, - "source": [ - "##### Copyright 2019 The TensorFlow Authors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "RuRlpLL-X0R_" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1mLJmVotXs64" - }, - "source": [ - "# Fine-tuning a BERT model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hYEwGTeCXnnX" - }, - "source": [ - "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/official_models/tutorials/fine_tune_bert.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/fine_tuning_bert.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/fine_tuning_bert.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/fine_tuning_bert.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca href=\"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/hub_logo_32px.png\" /\u003eSee TF Hub model\u003c/a\u003e\n", - " \u003c/td\u003e\n", - "\u003c/table\u003e" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YN2ACivEPxgD" - }, - "source": [ - "In this example, we will work through fine-tuning a BERT model using the tensorflow-models PIP package.\n", - "\n", - "The pretrained BERT model this tutorial is based on is also available on [TensorFlow Hub](https://tensorflow.org/hub); to see how to use it, refer to the [Hub Appendix](#hub_bert)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "s2d9S2CSSO1z" - }, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fsACVQpVSifi" - }, - "source": [ - "### Install the TensorFlow Model Garden pip package\n", - "\n", - "* `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` GitHub repo. To include the latest changes, you may install `tf-models-nightly`,\n", - "which is the nightly Model Garden package created daily automatically.\n", - "* pip will install all models and dependencies automatically."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NvNr2svBM-p3" - }, - "outputs": [], - "source": [ - "!pip install -q tf-models-official==2.4.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "U-7qPCjWUAyy" - }, - "source": [ - "### Imports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lXsXev5MNr20" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "\n", - "import tensorflow as tf\n", - "\n", - "import tensorflow_hub as hub\n", - "import tensorflow_datasets as tfds\n", - "tfds.disable_progress_bar()\n", - "\n", - "from official.modeling import tf_utils\n", - "from official import nlp\n", - "from official.nlp import bert\n", - "\n", - "# Load the required submodules\n", - "import official.nlp.optimization\n", - "import official.nlp.bert.bert_models\n", - "import official.nlp.bert.configs\n", - "import official.nlp.bert.run_classifier\n", - "import official.nlp.bert.tokenization\n", - "import official.nlp.data.classifier_data_lib\n", - "import official.nlp.modeling.losses\n", - "import official.nlp.modeling.models\n", - "import official.nlp.modeling.networks\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mbanlzTvJBsz" - }, - "source": [ - "### Resources" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PpW0x8TpR8DT" - }, - "source": [ - "This directory contains the configuration, vocabulary, and a pre-trained checkpoint used in this tutorial:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vzRHOLciR8eq" - }, - "outputs": [], - "source": [ - "gs_folder_bert = \"gs://cloud-tpu-checkpoints/bert/v3/uncased_L-12_H-768_A-12\"\n", - "tf.io.gfile.listdir(gs_folder_bert)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9uFskufsR2LT" - }, - "source": [ - "You can get a pre-trained BERT encoder from [TensorFlow Hub](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "e0dAkUttJAzj" - }, - "outputs": [], - "source": [ - "hub_url_bert = \"https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Qv6abtRvH4xO" - }, - "source": [ - "## The data\n", - "For this example we used the [GLUE MRPC dataset from TFDS](https://www.tensorflow.org/datasets/catalog/glue#gluemrpc).\n", - "\n", - "This dataset is not set up so that it can be directly fed into the BERT model, so this section also handles the necessary preprocessing." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "28DvUhC1YUiB" - }, - "source": [ - "### Get the dataset from TensorFlow Datasets\n", - "\n", - "The Microsoft Research Paraphrase Corpus (Dolan \u0026 Brockett, 2005) is a corpus of sentence pairs automatically extracted from online news sources, with human annotations for whether the sentences in the pair are semantically equivalent.\n", - "\n", - "* Number of labels: 2.\n", - "* Size of training dataset: 3668.\n", - "* Size of evaluation dataset: 408.\n", - "* Maximum sequence length of training and evaluation dataset: 128.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Ijikx5OsH9AT" - }, - "outputs": [], - "source": [ - "glue, info = tfds.load('glue/mrpc', with_info=True,\n", - " # It's small, load the whole dataset\n", - " batch_size=-1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xf9zz4vLYXjr" - }, - "outputs": [], - "source": [ - "list(glue.keys())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZgBg2r2nYT-K" - }, - "source": [ - "The `info` object describes the dataset and its features:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "IQrHxv7W7jH5" - }, - "outputs": [], - "source": [ - "info.features" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vhsVWYNxazz5" - }, - "source": [ - "The two classes are:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "n0gfc_VTayfQ" - }, - "outputs": [], - "source": [ - "info.features['label'].names" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "38zJcap6xkbC" - }, - "source": [ - "Here is one example from the training set:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xON_i6SkwApW" - }, - "outputs": [], - "source": [ - "glue_train = glue['train']\n", - "\n", - "for key, value in glue_train.items():\n", - " print(f\"{key:9s}: {value[0].numpy()}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9fbTyfJpNr7x" - }, - "source": [ - "### The BERT tokenizer" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wqeN54S61ZKQ" - }, - "source": [ - "To fine-tune a pre-trained model you need to be sure that you're using exactly the same tokenization, vocabulary, and index mapping as you used during training.\n", - "\n", - "The BERT tokenizer used in this tutorial is written in pure Python (it's not built out of TensorFlow ops). 
So you can't just plug it into your model as a `keras.layer` like you can with `preprocessing.TextVectorization`.\n", - "\n", - "The following code rebuilds the tokenizer that was used by the base model:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "idxyhmrCQcw5" - }, - "outputs": [], - "source": [ - "# Set up tokenizer to generate Tensorflow dataset\n", - "tokenizer = bert.tokenization.FullTokenizer(\n", - " vocab_file=os.path.join(gs_folder_bert, \"vocab.txt\"),\n", - " do_lower_case=True)\n", - "\n", - "print(\"Vocab size:\", len(tokenizer.vocab))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zYHDSquU2lDU" - }, - "source": [ - "Tokenize a sentence:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "L_OfOYPg853R" - }, - "outputs": [], - "source": [ - "tokens = tokenizer.tokenize(\"Hello TensorFlow!\")\n", - "print(tokens)\n", - "ids = tokenizer.convert_tokens_to_ids(tokens)\n", - "print(ids)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kkAXLtuyWWDI" - }, - "source": [ - "### Preprocess the data\n", - "\n", - "This section manually preprocesses the dataset into the format expected by the model.\n", - "\n", - "This dataset is small, so preprocessing can be done quickly and easily in memory. For larger datasets the `tf_models` library includes some tools for preprocessing and re-serializing a dataset. See [Appendix: Re-encoding a large dataset](#re_encoding_tools) for details." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "62UTWLQd9-LB" - }, - "source": [ - "#### Encode the sentences\n", - "\n", - "The model expects its two input sentences to be concatenated together. This input is expected to start with a `[CLS]` \"This is a classification problem\" token, and each sentence should end with a `[SEP]` \"Separator\" token:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "bdL-dRNRBRJT" - }, - "outputs": [], - "source": [ - "tokenizer.convert_tokens_to_ids(['[CLS]', '[SEP]'])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UrPktnqpwqie" - }, - "source": [ - "Start by encoding all the sentences while appending a `[SEP]` token, and packing them into ragged tensors:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BR7BmtU498Bh" - }, - "outputs": [], - "source": [ - "def encode_sentence(s):\n", - " tokens = list(tokenizer.tokenize(s.numpy()))\n", - " tokens.append('[SEP]')\n", - " return tokenizer.convert_tokens_to_ids(tokens)\n", - "\n", - "sentence1 = tf.ragged.constant([\n", - " encode_sentence(s) for s in glue_train[\"sentence1\"]])\n", - "sentence2 = tf.ragged.constant([\n", - " encode_sentence(s) for s in glue_train[\"sentence2\"]])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "has42aUdfky-" - }, - "outputs": [], - "source": [ - "print(\"Sentence1 shape:\", sentence1.shape.as_list())\n", - "print(\"Sentence2 shape:\", sentence2.shape.as_list())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MU9lTWy_xXbb" - }, - "source": [ - "Now prepend a `[CLS]` token, and concatenate the ragged tensors to form a single `input_word_ids` tensor for each example. `RaggedTensor.to_tensor()` zero-pads to the longest sequence."
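As a toy illustration of that zero-padding behavior (not a cell from the original notebook; the token ids here are made up):

```python
import tensorflow as tf

# Two "sentences" of different lengths, as a RaggedTensor of token ids.
rt = tf.ragged.constant([[101, 7592, 102],
                         [101, 102]])
print(rt.to_tensor())
# [[ 101 7592  102]
#  [ 101  102    0]]  <- the shorter row is zero-padded to the longest sequence
```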
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "USD8uihw-g4J" - }, - "outputs": [], - "source": [ - "cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*sentence1.shape[0]\n", - "input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)\n", - "_ = plt.pcolormesh(input_word_ids.to_tensor())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xmNv4l4k-dBZ" - }, - "source": [ - "#### Mask and input type" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DIWjNIKq-ldh" - }, - "source": [ - "The model expects two additional inputs:\n", - "\n", - "* The input mask\n", - "* The input type" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ulNZ4U96-8JZ" - }, - "source": [ - "The mask allows the model to cleanly differentiate between the content and the padding. The mask has the same shape as the `input_word_ids`, and contains a `1` anywhere the `input_word_ids` is not padding." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EezOO9qj91kP" - }, - "outputs": [], - "source": [ - "input_mask = tf.ones_like(input_word_ids).to_tensor()\n", - "\n", - "plt.pcolormesh(input_mask)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rxLenwAvCkBf" - }, - "source": [ - "The \"input type\" also has the same shape, but inside the non-padded region, contains a `0` or a `1` indicating which sentence the token is a part of. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2CetH_5C9P2m" - }, - "outputs": [], - "source": [ - "type_cls = tf.zeros_like(cls)\n", - "type_s1 = tf.zeros_like(sentence1)\n", - "type_s2 = tf.ones_like(sentence2)\n", - "input_type_ids = tf.concat([type_cls, type_s1, type_s2], axis=-1).to_tensor()\n", - "\n", - "plt.pcolormesh(input_type_ids)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "P5UBnCn8Ii6s" - }, - "source": [ - "#### Put it all together\n", - "\n", - "Collect the above text parsing code into a single function, and apply it to each split of the `glue/mrpc` dataset." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sDGiWYPLEd5a" - }, - "outputs": [], - "source": [ - "def encode_sentence(s, tokenizer):\n", - " tokens = list(tokenizer.tokenize(s))\n", - " tokens.append('[SEP]')\n", - " return tokenizer.convert_tokens_to_ids(tokens)\n", - "\n", - "def bert_encode(glue_dict, tokenizer):\n", - " num_examples = len(glue_dict[\"sentence1\"])\n", - " \n", - " sentence1 = tf.ragged.constant([\n", - " encode_sentence(s, tokenizer)\n", - " for s in np.array(glue_dict[\"sentence1\"])])\n", - " sentence2 = tf.ragged.constant([\n", - " encode_sentence(s, tokenizer)\n", - " for s in np.array(glue_dict[\"sentence2\"])])\n", - "\n", - " cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*sentence1.shape[0]\n", - " input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)\n", - "\n", - " input_mask = tf.ones_like(input_word_ids).to_tensor()\n", - "\n", - " type_cls = tf.zeros_like(cls)\n", - " type_s1 = tf.zeros_like(sentence1)\n", - " type_s2 = tf.ones_like(sentence2)\n", - " input_type_ids = tf.concat(\n", - " [type_cls, type_s1, type_s2], axis=-1).to_tensor()\n", - "\n", - " inputs = {\n", - " 'input_word_ids': input_word_ids.to_tensor(),\n", - " 'input_mask': input_mask,\n", - " 'input_type_ids': input_type_ids}\n", - "\n", - " return inputs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "yuLKxf6zHxw-" - }, - "outputs": [], - "source": [ - "glue_train = bert_encode(glue['train'], tokenizer)\n", - "glue_train_labels = glue['train']['label']\n", - "\n", - "glue_validation = bert_encode(glue['validation'], tokenizer)\n", - "glue_validation_labels = glue['validation']['label']\n", - "\n", - "glue_test = bert_encode(glue['test'], tokenizer)\n", - "glue_test_labels = glue['test']['label']" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7FC5aLVxKVKK" - }, - "source": [ - "Each subset of the data has been converted to a dictionary of features, and a set of labels. 
Each feature in the input dictionary has the same shape, and the number of labels should match:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jyjTdGpFhO_1" - }, - "outputs": [], - "source": [ - "for key, value in glue_train.items():\n", - " print(f'{key:15s} shape: {value.shape}')\n", - "\n", - "print(f'glue_train_labels shape: {glue_train_labels.shape}')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FSwymsbkbLDA" - }, - "source": [ - "## The model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Efrj3Cn1kLAp" - }, - "source": [ - "### Build the model\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xxpOY5r2Ayq6" - }, - "source": [ - "The first step is to download the configuration for the pre-trained model.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ujapVfZ_AKW7" - }, - "outputs": [], - "source": [ - "import json\n", - "\n", - "bert_config_file = os.path.join(gs_folder_bert, \"bert_config.json\")\n", - "config_dict = json.loads(tf.io.gfile.GFile(bert_config_file).read())\n", - "\n", - "bert_config = bert.configs.BertConfig.from_dict(config_dict)\n", - "\n", - "config_dict" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "96ldxDSwkVkj" - }, - "source": [ - "The `config` defines the core BERT model, which is a Keras model that predicts the outputs of `num_classes` classes from inputs with maximum sequence length `max_seq_length`.\n", - "\n", - "The `classifier_model` function below returns both the encoder and the classifier." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cH682__U0FBv" - }, - "outputs": [], - "source": [ - "bert_classifier, bert_encoder = bert.bert_models.classifier_model(\n", - " bert_config, num_labels=2)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XqKp3-5GIZlw" - }, - "source": [ - "The classifier has three inputs and one output:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "bAQblMIjwkvx" - }, - "outputs": [], - "source": [ - "tf.keras.utils.plot_model(bert_classifier, show_shapes=True, dpi=48)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sFmVG4SKZAw8" - }, - "source": [ - "Run it on a test batch of 10 examples from the training set. The output is the logits for the two classes:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VTjgPbp4ZDKo" - }, - "outputs": [], - "source": [ - "glue_batch = {key: val[:10] for key, val in glue_train.items()}\n", - "\n", - "bert_classifier(\n", - " glue_batch, training=True\n", - ").numpy()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Q0NTdwZsQK8n" - }, - "source": [ - "The `TransformerEncoder` in the center of the classifier above **is** the `bert_encoder`.\n", - "\n", - "Inspecting the encoder, we see its stack of `Transformer` layers connected to those same three inputs:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8L__-erBwLIQ" - }, - "outputs": [], - "source": [ - "tf.keras.utils.plot_model(bert_encoder, show_shapes=True, dpi=48)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mKAvkQc3heSy" - }, - "source": [ - "### Restore the encoder weights\n", - "\n", - "When built, the encoder is randomly initialized. 
Restore the encoder's weights from the checkpoint:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "97Ll2Gichd_Y" - }, - "outputs": [], - "source": [ - "checkpoint = tf.train.Checkpoint(encoder=bert_encoder)\n", - "checkpoint.read(\n", - " os.path.join(gs_folder_bert, 'bert_model.ckpt')).assert_consumed()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2oHOql35k3Dd" - }, - "source": [ - "Note: The pretrained `TransformerEncoder` is also available on [TensorFlow Hub](https://tensorflow.org/hub). See the [Hub appendix](#hub_bert) for details. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "115caFLMk-_l" - }, - "source": [ - "### Set up the optimizer\n", - "\n", - "BERT adopts the Adam optimizer with weight decay (aka \"[AdamW](https://arxiv.org/abs/1711.05101)\").\n", - "It also employs a learning rate schedule that first warms up from 0 and then decays to 0." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "w8qXKRZuCwW4" - }, - "outputs": [], - "source": [ - "# Set up epochs and steps\n", - "epochs = 3\n", - "batch_size = 32\n", - "eval_batch_size = 32\n", - "\n", - "train_data_size = len(glue_train_labels)\n", - "steps_per_epoch = int(train_data_size / batch_size)\n", - "num_train_steps = steps_per_epoch * epochs\n", - "warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)\n", - "\n", - "# Creates an optimizer with a learning rate schedule\n", - "optimizer = nlp.optimization.create_optimizer(\n", - " 2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pXRGxiRNEHS2" - }, - "source": [ - "This returns an `AdamWeightDecay` optimizer with the learning rate schedule set:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eQNA16bhDpky" - }, - "outputs": [], - "source": [ - "type(optimizer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xqu_K71fJQB8" - }, - "source": [ - "To see an example of how to customize the optimizer and its schedule, see the [Optimizer schedule appendix](#optimizer_schedule)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "78FEUOOEkoP0" - }, - "source": [ - "### Train the model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "OTNcA0O0nSq9" - }, - "source": [ - "The metric is accuracy, and the loss is sparse categorical cross-entropy."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nzi8hjeTQTRs" - }, - "outputs": [], - "source": [ - "metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)]\n", - "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n", - "\n", - "bert_classifier.compile(\n", - " optimizer=optimizer,\n", - " loss=loss,\n", - " metrics=metrics)\n", - "\n", - "bert_classifier.fit(\n", - " glue_train, glue_train_labels,\n", - " validation_data=(glue_validation, glue_validation_labels),\n", - " batch_size=32,\n", - " epochs=epochs)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IFtKFWbNKb0u" - }, - "source": [ - "Now run the fine-tuned model on a custom example to see that it works.\n", - "\n", - "Start by encoding some sentence pairs:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9ZoUgDUNJPz3" - }, - "outputs": [], - "source": [ - "my_examples = bert_encode(\n", - " glue_dict = {\n", - " 'sentence1':[\n", - " 'The rain in Spain falls mainly on the plain.',\n", - " 'Look I fine tuned BERT.'],\n", - " 'sentence2':[\n", - " 'It mostly rains on the flat lands of Spain.',\n", - " 'Is it working? This does not match.']\n", - " },\n", - " tokenizer=tokenizer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7ynJibkBRTJF" - }, - "source": [ - "The model should report class `1` \"match\" for the first example and class `0` \"no-match\" for the second:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "umo0ttrgRYIM" - }, - "outputs": [], - "source": [ - "result = bert_classifier(my_examples, training=False)\n", - "\n", - "result = tf.argmax(result, axis=-1).numpy()\n", - "result" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "utGl0M3aZCE4" - }, - "outputs": [], - "source": [ - "np.array(info.features['label'].names)[result]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fVo_AnT0l26j" - }, - "source": [ - "### Save the model\n", - "\n", - "Often the goal of training a model is to _use_ it for something, so export the model and then restore it to be sure that it works." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nl5x6nElZqkP" - }, - "outputs": [], - "source": [ - "export_dir='./saved_model'\n", - "tf.saved_model.save(bert_classifier, export_dir=export_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "id": "y_ACvKPsVUXC" - }, - "outputs": [], - "source": [ - "reloaded = tf.saved_model.load(export_dir)\n", - "reloaded_result = reloaded([my_examples['input_word_ids'],\n", - " my_examples['input_mask'],\n", - " my_examples['input_type_ids']], training=False)\n", - "\n", - "original_result = bert_classifier(my_examples, training=False)\n", - "\n", - "# The results are (nearly) identical:\n", - "print(original_result.numpy())\n", - "print()\n", - "print(reloaded_result.numpy())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eQceYqRFT_Eg" - }, - "source": [ - "## Appendix" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SaC1RlFawUpc" - }, - "source": [ - "\u003ca id=re_encoding_tools\u003e\u003c/a\u003e\n", - "### Re-encoding a large dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CwUdjFBkzUgh" - }, - "source": [ - "In this tutorial, you re-encoded the dataset in memory for clarity.\n", - "\n", - "This was only possible because `glue/mrpc` is a very small dataset. To deal with larger datasets, the `tf_models` library includes some tools for processing and re-encoding a dataset for efficient training." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2UTQrkyOT5wD" - }, - "source": [ - "The first step is to describe which features of the dataset should be transformed:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XQeDFOzYR9Z9" - }, - "outputs": [], - "source": [ - "processor = nlp.data.classifier_data_lib.TfdsProcessor(\n", - " tfds_params=\"dataset=glue/mrpc,text_key=sentence1,text_b_key=sentence2\",\n", - " process_text_fn=bert.tokenization.convert_to_unicode)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XrFQbfErUWxa" - }, - "source": [ - "Then apply the transformation to generate new TFRecord files."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ymw7GOHpSHKU" - }, - "outputs": [], - "source": [ - "# Set up output paths for the training and evaluation TensorFlow datasets\n", - "train_data_output_path=\"./mrpc_train.tf_record\"\n", - "eval_data_output_path=\"./mrpc_eval.tf_record\"\n", - "\n", - "max_seq_length = 128\n", - "batch_size = 32\n", - "eval_batch_size = 32\n", - "\n", - "# Generate and save training data into a tf record file\n", - "input_meta_data = (\n", - " nlp.data.classifier_data_lib.generate_tf_record_from_data_file(\n", - " processor=processor,\n", - " data_dir=None, # It is `None` because data is from tfds, not local dir.\n", - " tokenizer=tokenizer,\n", - " train_data_output_path=train_data_output_path,\n", - " eval_data_output_path=eval_data_output_path,\n", - " max_seq_length=max_seq_length))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uX_Sp-wTUoRm" - }, - "source": [ - "Finally, create `tf.data` input pipelines from those TFRecord files:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rkHxIK57SQ_r" - }, - "outputs": [], - "source": [ - "training_dataset = bert.run_classifier.get_dataset_fn(\n", - " train_data_output_path,\n", - " max_seq_length,\n", - " batch_size,\n", - " is_training=True)()\n", - "\n", - "evaluation_dataset = bert.run_classifier.get_dataset_fn(\n", - " eval_data_output_path,\n", - " max_seq_length,\n", - " eval_batch_size,\n", - " is_training=False)()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "stbaVouogvzS" - }, - "source": [ - "The resulting `tf.data.Datasets` return `(features, labels)` pairs, as expected by `keras.Model.fit`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gwhrlQl4gxVF" - }, - "outputs": [], - "source": [ - "training_dataset.element_spec" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dbJ76vSJj77j" - }, - "source": [ - "#### Create tf.data.Dataset for training and evaluation\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9J95LFRohiYw" - }, - "source": [ - "If you need to modify the data loading, here is some code to get you started:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gCvaLLAxPuMc" - }, - "outputs": [], - "source": [ - "def create_classifier_dataset(file_path, seq_length, batch_size, is_training):\n", - " \"\"\"Creates input dataset from (tf)records files for train/eval.\"\"\"\n", - " dataset = tf.data.TFRecordDataset(file_path)\n", - " if is_training:\n", - " dataset = dataset.shuffle(100)\n", - " dataset = dataset.repeat()\n", - "\n", - " def decode_record(record):\n", - " name_to_features = {\n", - " 'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64),\n", - " 'input_mask': tf.io.FixedLenFeature([seq_length], tf.int64),\n", - " 'segment_ids': tf.io.FixedLenFeature([seq_length], tf.int64),\n", - " 'label_ids': tf.io.FixedLenFeature([], tf.int64),\n", - " }\n", - " return tf.io.parse_single_example(record, name_to_features)\n", - "\n", - " def _select_data_from_record(record):\n", - " x = {\n", - " 'input_word_ids': record['input_ids'],\n", - " 'input_mask': record['input_mask'],\n", - " 'input_type_ids': record['segment_ids']\n", - " }\n", - " y = record['label_ids']\n", - " return (x, y)\n", - "\n", - " dataset = dataset.map(decode_record,\n", - " num_parallel_calls=tf.data.experimental.AUTOTUNE)\n", - " dataset = dataset.map(\n", - " _select_data_from_record,\n", - " 
num_parallel_calls=tf.data.experimental.AUTOTUNE)\n", - " dataset = dataset.batch(batch_size, drop_remainder=is_training)\n", - " dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)\n", - " return dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rutkBadrhzdR" - }, - "outputs": [], - "source": [ - "# Set up batch sizes\n", - "batch_size = 32\n", - "eval_batch_size = 32\n", - "\n", - "# Return TensorFlow dataset\n", - "training_dataset = create_classifier_dataset(\n", - " train_data_output_path,\n", - " input_meta_data['max_seq_length'],\n", - " batch_size,\n", - " is_training=True)\n", - "\n", - "evaluation_dataset = create_classifier_dataset(\n", - " eval_data_output_path,\n", - " input_meta_data['max_seq_length'],\n", - " eval_batch_size,\n", - " is_training=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "59TVgt4Z7fuU" - }, - "outputs": [], - "source": [ - "training_dataset.element_spec" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QbklKt-w_CiI" - }, - "source": [ - "\u003ca id=\"hub_bert\"\u003e\u003c/a\u003e\n", - "\n", - "### TFModels BERT on TFHub\n", - "\n", - "You can get [the BERT model](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2) off the shelf from [TFHub](https://tensorflow.org/hub). It would not be hard to add a classification head on top of this `hub.KerasLayer`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GDWrHm0BGpbX" - }, - "outputs": [], - "source": [ - "# Note: 350MB download.\n", - "import tensorflow_hub as hub" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "Y29meH0qGq_5" - }, - "outputs": [], - "source": [ - "hub_model_name = \"bert_en_uncased_L-12_H-768_A-12\" #@param [\"bert_en_uncased_L-24_H-1024_A-16\", \"bert_en_wwm_cased_L-24_H-1024_A-16\", \"bert_en_uncased_L-12_H-768_A-12\", \"bert_en_wwm_uncased_L-24_H-1024_A-16\", \"bert_en_cased_L-24_H-1024_A-16\", \"bert_en_cased_L-12_H-768_A-12\", \"bert_zh_L-12_H-768_A-12\", \"bert_multi_cased_L-12_H-768_A-12\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lo6479At4sP1" - }, - "outputs": [], - "source": [ - "hub_encoder = hub.KerasLayer(f\"https://tfhub.dev/tensorflow/{hub_model_name}/3\",\n", - " trainable=True)\n", - "\n", - "print(f\"The Hub encoder has {len(hub_encoder.trainable_variables)} trainable variables\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iTzF574wivQv" - }, - "source": [ - "Test run it on a batch of data:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XEcYrCR45Uwo" - }, - "outputs": [], - "source": [ - "result = hub_encoder(\n", - " inputs=dict(\n", - " input_word_ids=glue_train['input_word_ids'][:10],\n", - " input_mask=glue_train['input_mask'][:10],\n", - " input_type_ids=glue_train['input_type_ids'][:10],),\n", - " training=False,\n", - ")\n", - "\n", - "print(\"Pooled output shape:\", result['pooled_output'].shape)\n", - "print(\"Sequence output shape:\", result['sequence_output'].shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cjojn8SmLSRI" - }, - "source": [ - "At this point it would be simple to add a classification head yourself.\n", - "\n", - "The `nlp.modeling.models.BertClassifier` class can also build a classifier onto the encoder from TensorFlow Hub:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": { - "id": "9nTDaApyLR70" - }, - "outputs": [], - "source": [ - "hub_classifier = nlp.modeling.models.BertClassifier(\n", - " bert_encoder,\n", - " num_classes=2,\n", - " dropout_rate=0.1,\n", - " initializer=tf.keras.initializers.TruncatedNormal(\n", - " stddev=0.02))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xMJX3wV0_v7I" - }, - "source": [ - "The one downside to loading this model from TFHub is that the structure of internal keras layers is not restored. So it's more difficult to inspect or modify the model. The `BertEncoder` model is now a single layer:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pD71dnvhM2QS" - }, - "outputs": [], - "source": [ - "tf.keras.utils.plot_model(hub_classifier, show_shapes=True, dpi=64)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nLZD-isBzNKi" - }, - "outputs": [], - "source": [ - "try:\n", - " tf.keras.utils.plot_model(hub_encoder, show_shapes=True, dpi=64)\n", - " assert False\n", - "except Exception as e:\n", - " print(f\"{type(e).__name__}: {e}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZxSqH0dNAgXV" - }, - "source": [ - "\u003ca id=\"model_builder_functions\"\u003e\u003c/a\u003e\n", - "\n", - "### Low level model building\n", - "\n", - "If you need a more control over the construction of the model it's worth noting that the `classifier_model` function used earlier is really just a thin wrapper over the `nlp.modeling.networks.BertEncoder` and `nlp.modeling.models.BertClassifier` classes. Just remember that if you start modifying the architecture it may not be correct or possible to reload the pre-trained checkpoint so you'll need to retrain from scratch." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0cgABEwDj06P" - }, - "source": [ - "Build the encoder:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5r_yqhBFSVEM" - }, - "outputs": [], - "source": [ - "bert_encoder_config = config_dict.copy()\n", - "\n", - "# You need to rename a few fields to make this work:\n", - "bert_encoder_config['attention_dropout_rate'] = bert_encoder_config.pop('attention_probs_dropout_prob')\n", - "bert_encoder_config['activation'] = tf_utils.get_activation(bert_encoder_config.pop('hidden_act'))\n", - "bert_encoder_config['dropout_rate'] = bert_encoder_config.pop('hidden_dropout_prob')\n", - "bert_encoder_config['initializer'] = tf.keras.initializers.TruncatedNormal(\n", - " stddev=bert_encoder_config.pop('initializer_range'))\n", - "bert_encoder_config['max_sequence_length'] = bert_encoder_config.pop('max_position_embeddings')\n", - "bert_encoder_config['num_layers'] = bert_encoder_config.pop('num_hidden_layers')\n", - "\n", - "bert_encoder_config" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rIO8MI7LLijh" - }, - "outputs": [], - "source": [ - "manual_encoder = nlp.modeling.networks.BertEncoder(**bert_encoder_config)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4a4tFSg9krRi" - }, - "source": [ - "Restore the weights:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "X6N9NEqfXJCx" - }, - "outputs": [], - "source": [ - "checkpoint = tf.train.Checkpoint(encoder=manual_encoder)\n", - "checkpoint.read(\n", - " os.path.join(gs_folder_bert, 'bert_model.ckpt')).assert_consumed()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1BPiPO4ykuwM" - }, - "source": 
[ - "Test run it:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "hlVdgJKmj389" - }, - "outputs": [], - "source": [ - "result = manual_encoder(my_examples, training=True)\n", - "\n", - "print(\"Sequence output shape:\", result[0].shape)\n", - "print(\"Pooled output shape:\", result[1].shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nJMXvVgJkyBv" - }, - "source": [ - "Wrap it in a classifier:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tQX57GJ6wkAb" - }, - "outputs": [], - "source": [ - "manual_classifier = nlp.modeling.models.BertClassifier(\n", - " bert_encoder,\n", - " num_classes=2,\n", - " dropout_rate=bert_encoder_config['dropout_rate'],\n", - " initializer=bert_encoder_config['initializer'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kB-nBWhQk0dS" - }, - "outputs": [], - "source": [ - "manual_classifier(my_examples, training=True).numpy()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "E6AJlOSyIO1L" - }, - "source": [ - "\u003ca id=\"optiizer_schedule\"\u003e\u003c/a\u003e\n", - "\n", - "### Optimizers and schedules\n", - "\n", - "The optimizer used to train the model was created using the `nlp.optimization.create_optimizer` function:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "28Dv3BPRlFTD" - }, - "outputs": [], - "source": [ - "optimizer = nlp.optimization.create_optimizer(\n", - " 2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LRjcHr0UlT8c" - }, - "source": [ - "That high level wrapper sets up the learning rate schedules and the optimizer.\n", - "\n", - "The base learning rate schedule used here is a linear decay to zero over the training run:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MHY8K6kDngQn" - }, - "outputs": [], - "source": [ - "epochs = 3\n", - "batch_size = 32\n", - "eval_batch_size = 32\n", - "\n", - "train_data_size = len(glue_train_labels)\n", - "steps_per_epoch = int(train_data_size / batch_size)\n", - "num_train_steps = steps_per_epoch * epochs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true, - "id": "wKIcSprulu3P" - }, - "outputs": [], - "source": [ - "decay_schedule = tf.keras.optimizers.schedules.PolynomialDecay(\n", - " initial_learning_rate=2e-5,\n", - " decay_steps=num_train_steps,\n", - " end_learning_rate=0)\n", - "\n", - "plt.plot([decay_schedule(n) for n in range(num_train_steps)])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IMTC_gfAl_PZ" - }, - "source": [ - "This, in turn is wrapped in a `WarmUp` schedule that linearly increases the learning rate to the target value over the first 10% of training:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "YRt3VTmBmCBY" - }, - "outputs": [], - "source": [ - "warmup_steps = num_train_steps * 0.1\n", - "\n", - "warmup_schedule = nlp.optimization.WarmUp(\n", - " initial_learning_rate=2e-5,\n", - " decay_schedule_fn=decay_schedule,\n", - " warmup_steps=warmup_steps)\n", - "\n", - "# The warmup overshoots, because it warms up to the `initial_learning_rate`\n", - "# following the original implementation. 
You can set\n", - "# `initial_learning_rate=decay_schedule(warmup_steps)` if you don't like the\n", - "# overshoot.\n", - "plt.plot([warmup_schedule(n) for n in range(num_train_steps)])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "l8D9Lv3Bn740" - }, - "source": [ - "Then create the `nlp.optimization.AdamWeightDecay` using that schedule, configured for the BERT model:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2Hf2rpRXk89N" - }, - "outputs": [], - "source": [ - "optimizer = nlp.optimization.AdamWeightDecay(\n", - " learning_rate=warmup_schedule,\n", - " weight_decay_rate=0.01,\n", - " epsilon=1e-6,\n", - " exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'])" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "fine_tuning_bert.ipynb", - "private_outputs": true, - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/colab/nlp/customize_encoder.ipynb b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/colab/nlp/customize_encoder.ipynb deleted file mode 100644 index aeddb29f96352fbd4c8df3540e6bd4b8fe70bb8b..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/colab/nlp/customize_encoder.ipynb +++ /dev/null @@ -1,575 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "Customizing a Transformer Encoder", - "private_outputs": true, - "provenance": [], - "collapsed_sections": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "Bp8t2AI8i7uP" - }, - "source": [ - "##### Copyright 2020 The TensorFlow Authors." - ] - }, - { - "cell_type": "code", - "metadata": { - "cellView": "form", - "id": "rxPj2Lsni9O4" - }, - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6xS-9i5DrRvO" - }, - "source": [ - "# Customizing a Transformer Encoder" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Mwb9uw1cDXsa" - }, - "source": [ - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " View on TensorFlow.org\n", - " \n", - " Run in Google Colab\n", - " \n", - " View source on GitHub\n", - " \n", - " Download notebook\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iLrcV4IyrcGX" - }, - "source": [ - "## Learning objectives\n", - "\n", - "The [TensorFlow Models NLP library](https://github.com/tensorflow/models/tree/master/official/nlp/modeling) is a collection of tools for building and training modern high performance natural language models.\n", - "\n", - "The [TransformEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/encoder_scaffold.py) is the core of this library, and lots of new network architectures are proposed to improve the encoder. In this Colab notebook, we will learn how to customize the encoder to employ new network architectures." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YYxdyoWgsl8t" - }, - "source": [ - "## Install and import" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fEJSFutUsn_h" - }, - "source": [ - "### Install the TensorFlow Model Garden pip package\n", - "\n", - "* `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` github repo. To include latest changes, you may install `tf-models-nightly`,\n", - "which is the nightly Model Garden package created daily automatically.\n", - "* `pip` will install all models and dependencies automatically." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "thsKZDjhswhR" - }, - "source": [ - "!pip install -q tf-models-official==2.4.0" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hpf7JPCVsqtv" - }, - "source": [ - "### Import Tensorflow and other libraries" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "my4dp-RMssQe" - }, - "source": [ - "import numpy as np\n", - "import tensorflow as tf\n", - "\n", - "from official.modeling import activations\n", - "from official.nlp import modeling\n", - "from official.nlp.modeling import layers, losses, models, networks" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vjDmVsFfs85n" - }, - "source": [ - "## Canonical BERT encoder\n", - "\n", - "Before learning how to customize the encoder, let's firstly create a canonical BERT enoder and use it to instantiate a `BertClassifier` for classification task." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Oav8sbgstWc-" - }, - "source": [ - "cfg = {\n", - " \"vocab_size\": 100,\n", - " \"hidden_size\": 32,\n", - " \"num_layers\": 3,\n", - " \"num_attention_heads\": 4,\n", - " \"intermediate_size\": 64,\n", - " \"activation\": activations.gelu,\n", - " \"dropout_rate\": 0.1,\n", - " \"attention_dropout_rate\": 0.1,\n", - " \"max_sequence_length\": 16,\n", - " \"type_vocab_size\": 2,\n", - " \"initializer\": tf.keras.initializers.TruncatedNormal(stddev=0.02),\n", - "}\n", - "bert_encoder = modeling.networks.BertEncoder(**cfg)\n", - "\n", - "def build_classifier(bert_encoder):\n", - " return modeling.models.BertClassifier(bert_encoder, num_classes=2)\n", - "\n", - "canonical_classifier_model = build_classifier(bert_encoder)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Qe2UWI6_tsHo" - }, - "source": [ - "`canonical_classifier_model` can be trained using the training data. For details about how to train the model, please see the colab [fine_tuning_bert.ipynb](https://github.com/tensorflow/models/blob/master/official/colab/fine_tuning_bert.ipynb). 
We skip the code that trains the model here.\n", - "\n", - "After training, we can apply the model to do prediction.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "csED2d-Yt5h6" - }, - "source": [ - "def predict(model):\n", - " batch_size = 3\n", - " np.random.seed(0)\n", - " word_ids = np.random.randint(\n", - " cfg[\"vocab_size\"], size=(batch_size, cfg[\"max_sequence_length\"]))\n", - " mask = np.random.randint(2, size=(batch_size, cfg[\"max_sequence_length\"]))\n", - " type_ids = np.random.randint(\n", - " cfg[\"type_vocab_size\"], size=(batch_size, cfg[\"max_sequence_length\"]))\n", - " print(model([word_ids, mask, type_ids], training=False))\n", - "\n", - "predict(canonical_classifier_model)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PzKStEK9t_Pb" - }, - "source": [ - "## Customize BERT encoder\n", - "\n", - "One BERT encoder consists of an embedding network and multiple transformer blocks, and each transformer block contains an attention layer and a feedforward layer." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rmwQfhj6fmKz" - }, - "source": [ - "We provide easy ways to customize each of those components via (1)\n", - "[EncoderScaffold](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/encoder_scaffold.py) and (2) [TransformerScaffold](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/transformer_scaffold.py)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xsMgEVHAui11" - }, - "source": [ - "### Use EncoderScaffold\n", - "\n", - "`EncoderScaffold` allows users to provide a custom embedding subnetwork\n", - " (which will replace the standard embedding logic) and/or a custom hidden layer class (which will replace the `Transformer` instantiation in the encoder)." 
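To make those two customization points concrete, here is a hedged, illustrative sketch of how the scaffold composes them; the names and signatures below are simplified stand-ins, not the library's actual internals:

```python
# Illustrative pseudocode only -- simplified, not the real EncoderScaffold code.
def scaffold_forward(inputs, embedding_network, hidden_cls, hidden_cfg,
                     num_hidden_instances):
    # The (replaceable) embedding subnetwork turns the raw inputs into
    # embeddings plus an attention mask.
    embeddings, attention_mask = embedding_network(inputs)

    # The (replaceable) hidden layer class is instantiated once per layer
    # and stacked to form the transformer body.
    output = embeddings
    for _ in range(num_hidden_instances):
        output = hidden_cls(**hidden_cfg)([output, attention_mask])
    return output
```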
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-JBabpa2AOz8" - }, - "source": [ - "#### Without Customization\n", - "\n", - "Without any customization, `EncoderScaffold` behaves the same as the canonical `BertEncoder`.\n", - "\n", - "As shown in the following example, `EncoderScaffold` can load `BertEncoder`'s weights and output the same values:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ktNzKuVByZQf" - }, - "source": [ - "default_hidden_cfg = dict(\n", - " num_attention_heads=cfg[\"num_attention_heads\"],\n", - " intermediate_size=cfg[\"intermediate_size\"],\n", - " intermediate_activation=activations.gelu,\n", - " dropout_rate=cfg[\"dropout_rate\"],\n", - " attention_dropout_rate=cfg[\"attention_dropout_rate\"],\n", - " kernel_initializer=tf.keras.initializers.TruncatedNormal(0.02),\n", - ")\n", - "default_embedding_cfg = dict(\n", - " vocab_size=cfg[\"vocab_size\"],\n", - " type_vocab_size=cfg[\"type_vocab_size\"],\n", - " hidden_size=cfg[\"hidden_size\"],\n", - " initializer=tf.keras.initializers.TruncatedNormal(0.02),\n", - " dropout_rate=cfg[\"dropout_rate\"],\n", - " max_seq_length=cfg[\"max_sequence_length\"]\n", - ")\n", - "default_kwargs = dict(\n", - " hidden_cfg=default_hidden_cfg,\n", - " embedding_cfg=default_embedding_cfg,\n", - " num_hidden_instances=cfg[\"num_layers\"],\n", - " pooled_output_dim=cfg[\"hidden_size\"],\n", - " return_all_layer_outputs=True,\n", - " pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(0.02),\n", - ")\n", - "\n", - "encoder_scaffold = modeling.networks.EncoderScaffold(**default_kwargs)\n", - "classifier_model_from_encoder_scaffold = build_classifier(encoder_scaffold)\n", - "classifier_model_from_encoder_scaffold.set_weights(\n", - " canonical_classifier_model.get_weights())\n", - "predict(classifier_model_from_encoder_scaffold)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sMaUmLyIuwcs" - }, - "source": [ - "#### Customize Embedding\n", - "\n", - "Next, we show how to use a customized embedding network.\n", - "\n", - "We first build an embedding network that will replace the default network. This one will have 2 inputs (`mask` and `word_ids`) instead of 3, and won't use positional embeddings." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "LTinnaG6vcsw" - }, - "source": [ - "word_ids = tf.keras.layers.Input(\n", - " shape=(cfg['max_sequence_length'],), dtype=tf.int32, name=\"input_word_ids\")\n", - "mask = tf.keras.layers.Input(\n", - " shape=(cfg['max_sequence_length'],), dtype=tf.int32, name=\"input_mask\")\n", - "embedding_layer = modeling.layers.OnDeviceEmbedding(\n", - " vocab_size=cfg['vocab_size'],\n", - " embedding_width=cfg['hidden_size'],\n", - " initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),\n", - " name=\"word_embeddings\")\n", - "word_embeddings = embedding_layer(word_ids)\n", - "attention_mask = layers.SelfAttentionMask()([word_embeddings, mask])\n", - "new_embedding_network = tf.keras.Model([word_ids, mask],\n", - " [word_embeddings, attention_mask])" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HN7_yu-6O3qI" - }, - "source": [ - "Inspecting `new_embedding_network`, we can see it takes two inputs:\n", - "`input_word_ids` and `input_mask`."
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "fO9zKFE4OpHp" - }, - "source": [ - "tf.keras.utils.plot_model(new_embedding_network, show_shapes=True, dpi=48)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9cOaGQHLv12W" - }, - "source": [ - "We can then build a new encoder using the above `new_embedding_network`." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "mtFDMNf2vIl9" - }, - "source": [ - "kwargs = dict(default_kwargs)\n", - "\n", - "# Use new embedding network.\n", - "kwargs['embedding_cls'] = new_embedding_network\n", - "kwargs['embedding_data'] = embedding_layer.embeddings\n", - "\n", - "encoder_with_customized_embedding = modeling.networks.EncoderScaffold(**kwargs)\n", - "classifier_model = build_classifier(encoder_with_customized_embedding)\n", - "# ... Train the model ...\n", - "print(classifier_model.inputs)\n", - "\n", - "# Assert that there are only two inputs.\n", - "assert len(classifier_model.inputs) == 2" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Z73ZQDtmwg9K" - }, - "source": [ - "#### Customize Transformer\n", - "\n", - "A user can also override the [hidden_cls](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/encoder_scaffold.py#L103) argument in `EncoderScaffold`'s constructor to employ a customized Transformer layer.\n", - "\n", - "See [ReZeroTransformer](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/rezero_transformer.py) for how to implement a customized Transformer layer.\n", - "\n", - "The following is an example of using `ReZeroTransformer`:\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "uAIarLZgw6pA" - }, - "source": [ - "kwargs = dict(default_kwargs)\n", - "\n", - "# Use ReZeroTransformer.\n", - "kwargs['hidden_cls'] = modeling.layers.ReZeroTransformer\n", - "\n", - "encoder_with_rezero_transformer = modeling.networks.EncoderScaffold(**kwargs)\n", - "classifier_model = build_classifier(encoder_with_rezero_transformer)\n", - "# ... Train the model ...\n", - "predict(classifier_model)\n", - "\n", - "# Assert that the variable `rezero_alpha` from ReZeroTransformer exists.\n", - "assert 'rezero_alpha' in ''.join([x.name for x in classifier_model.trainable_weights])" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6PMHFdvnxvR0" - }, - "source": [ - "### Use [TransformerScaffold](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/transformer_scaffold.py)\n", - "\n", - "The above method of customizing `Transformer` requires rewriting the whole `Transformer` layer, while sometimes you may only want to customize either the attention layer or the feedforward block. 
In this case, [TransformerScaffold](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/transformer_scaffold.py) can be used.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "D6FejlgwyAy_" - }, - "source": [ - "#### Customize Attention Layer\n", - "\n", - "A user can also override the [attention_cls](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/transformer_scaffold.py#L45) argument in `TransformerScaffold`'s constructor to employ a customized Attention layer.\n", - "\n", - "See [TalkingHeadsAttention](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/talking_heads_attention.py) for how to implement a customized `Attention` layer.\n", - "\n", - "The following is an example of using [TalkingHeadsAttention](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/talking_heads_attention.py):" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "nFrSMrZuyNeQ" - }, - "source": [ - "# Use TalkingHeadsAttention\n", - "hidden_cfg = dict(default_hidden_cfg)\n", - "hidden_cfg['attention_cls'] = modeling.layers.TalkingHeadsAttention\n", - "\n", - "kwargs = dict(default_kwargs)\n", - "kwargs['hidden_cls'] = modeling.layers.TransformerScaffold\n", - "kwargs['hidden_cfg'] = hidden_cfg\n", - "\n", - "encoder = modeling.networks.EncoderScaffold(**kwargs)\n", - "classifier_model = build_classifier(encoder)\n", - "# ... Train the model ...\n", - "predict(classifier_model)\n", - "\n", - "# Assert that the variable `pre_softmax_weight` from TalkingHeadsAttention exists.\n", - "assert 'pre_softmax_weight' in ''.join([x.name for x in classifier_model.trainable_weights])" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kuEJcTyByVvI" - }, - "source": [ - "#### Customize Feedforward Layer\n", - "\n", - "Similarly, one could also customize the feedforward layer.\n", - "\n", - "See [GatedFeedforward](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/gated_feedforward.py) for how to implement a customized feedforward layer.\n", - "\n", - "The following is an example of using [GatedFeedforward](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/layers/gated_feedforward.py)." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "XAbKy_l4y_-i" - }, - "source": [ - "# Use GatedFeedforward\n", - "hidden_cfg = dict(default_hidden_cfg)\n", - "hidden_cfg['feedforward_cls'] = modeling.layers.GatedFeedforward\n", - "\n", - "kwargs = dict(default_kwargs)\n", - "kwargs['hidden_cls'] = modeling.layers.TransformerScaffold\n", - "kwargs['hidden_cfg'] = hidden_cfg\n", - "\n", - "encoder_with_gated_feedforward = modeling.networks.EncoderScaffold(**kwargs)\n", - "classifier_model = build_classifier(encoder_with_gated_feedforward)\n", - "# ... 
Train the model ...\n", - "predict(classifier_model)\n", - "\n", - "# Assert that the variable `gate` from GatedFeedforward exists.\n", - "assert 'gate' in ''.join([x.name for x in classifier_model.trainable_weights])" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a_8NWUhkzeAq" - }, - "source": [ - "### Build a new Encoder using building blocks from KerasBERT.\n", - "\n", - "Finally, you could also build a new encoder using building blocks in the modeling library.\n", - "\n", - "See [AlbertEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/albert_encoder.py) as an example:\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "xsiA3RzUzmUM" - }, - "source": [ - "albert_encoder = modeling.networks.AlbertEncoder(**cfg)\n", - "classifier_model = build_classifier(albert_encoder)\n", - "# ... Train the model ...\n", - "predict(classifier_model)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MeidDfhlHKSO" - }, - "source": [ - "Inspecting the `albert_encoder`, we see it stacks the same `Transformer` layer multiple times." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Uv_juT22HERW" - }, - "source": [ - "tf.keras.utils.plot_model(albert_encoder, show_shapes=True, dpi=48)" - ], - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/colab/nlp/nlp_modeling_library_intro.ipynb b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/colab/nlp/nlp_modeling_library_intro.ipynb deleted file mode 100644 index e4ce780c96bfbf679c91891f38b08ac3b0bb983e..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/colab/nlp/nlp_modeling_library_intro.ipynb +++ /dev/null @@ -1,544 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "80xnUmoI7fBX" - }, - "source": [ - "##### Copyright 2020 The TensorFlow Authors." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "8nvTnfs6Q692" - }, - "outputs": [], - "source": [ - "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WmfcMK5P5C1G" - }, - "source": [ - "# Introduction to the TensorFlow Models NLP library" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cH-oJ8R6AHMK" - }, - "source": [ - "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/official_models/nlp/nlp_modeling_library_intro\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/models/official/colab/nlp/nlp_modeling_library_intro.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n", - " \u003c/td\u003e\n", - "\u003c/table\u003e" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0H_EFIhq4-MJ" - }, - "source": [ - "## Learning objectives\n", - "\n", - "In this Colab notebook, you will learn how to build transformer-based models for common NLP tasks including pretraining, span labelling and classification using the building blocks from [NLP modeling library](https://github.com/tensorflow/models/tree/master/official/nlp/modeling)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2N97-dps_nUk" - }, - "source": [ - "## Install and import" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "459ygAVl_rg0" - }, - "source": [ - "### Install the TensorFlow Model Garden pip package\n", - "\n", - "* `tf-models-official` is the stable Model Garden package. Note that it may not include the latest changes in the `tensorflow_models` github repo. To include latest changes, you may install `tf-models-nightly`,\n", - "which is the nightly Model Garden package created daily automatically.\n", - "* `pip` will install all models and dependencies automatically." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Y-qGkdh6_sZc" - }, - "outputs": [], - "source": [ - "!pip install -q tf-models-official==2.4.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e4huSSwyAG_5" - }, - "source": [ - "### Import TensorFlow and other libraries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jqYXqtjBAJd9" - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import tensorflow as tf\n", - "\n", - "from official.nlp import modeling\n", - "from official.nlp.modeling import layers, losses, models, networks" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "djBQWjvy-60Y" - }, - "source": [ - "## BERT pretraining model\n", - "\n", - "BERT ([Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)) introduced the method of pre-training language representations on a large text corpus and then using that model for downstream NLP tasks.\n", - "\n", - "In this section, we will learn how to build a model to pretrain BERT on the masked language modeling task and the next sentence prediction task. For simplicity, we only show a minimal example and use dummy data." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MKuHVlsCHmiq" - }, - "source": [ - "### Build a `BertPretrainer` model wrapping `BertEncoder`\n", - "\n", - "The [BertEncoder](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/networks/bert_encoder.py) implements the Transformer-based encoder as described in the [BERT paper](https://arxiv.org/abs/1810.04805). It includes the embedding lookups and transformer layers, but not the masked language model or classification task networks.\n", - "\n", - "The [BertPretrainer](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_pretrainer.py) allows a user to pass in a transformer stack, and instantiates the masked language model and classification networks that are used to create the training objectives." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EXkcXz-9BwB3" - }, - "outputs": [], - "source": [ - "# Build a small transformer network.\n", - "vocab_size = 100\n", - "sequence_length = 16\n", - "network = modeling.networks.BertEncoder(\n", - " vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0NH5irV5KTMS" - }, - "source": [ - "Inspecting the encoder, we see that it contains a few embedding layers and a stack of `Transformer` layers, connected to three input layers:\n", - "\n", - "`input_word_ids`, `input_type_ids` and `input_mask`.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lZNoZkBrIoff" - }, - "outputs": [], - "source": [ - "tf.keras.utils.plot_model(network, show_shapes=True, dpi=48)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "o7eFOZXiIl-b" - }, - "outputs": [], - "source": [ - "# Create a BERT pretrainer with the created network.\n", - "num_token_predictions = 8\n", - "bert_pretrainer = modeling.models.BertPretrainer(\n", - " network, num_classes=2, num_token_predictions=num_token_predictions, output='predictions')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "d5h5HT7gNHx_" - }, - "source": [ - "Inspecting the `bert_pretrainer`, we see it wraps the `encoder` with additional `MaskedLM` and `Classification` heads."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "2tcNfm03IBF7" - }, - "outputs": [], - "source": [ - "tf.keras.utils.plot_model(bert_pretrainer, show_shapes=True, dpi=48)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "F2oHrXGUIS0M" - }, - "outputs": [], - "source": [ - "# We can feed some dummy data to get masked language model and sentence output.\n", - "batch_size = 2\n", - "word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length))\n", - "mask_data = np.random.randint(2, size=(batch_size, sequence_length))\n", - "type_id_data = np.random.randint(2, size=(batch_size, sequence_length))\n", - "masked_lm_positions_data = np.random.randint(2, size=(batch_size, num_token_predictions))\n", - "\n", - "outputs = bert_pretrainer(\n", - " [word_id_data, mask_data, type_id_data, masked_lm_positions_data])\n", - "lm_output = outputs[\"masked_lm\"]\n", - "sentence_output = outputs[\"classification\"]\n", - "print(lm_output)\n", - "print(sentence_output)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bnx3UCHniCS5" - }, - "source": [ - "### Compute loss\n", - "Next, we can use `lm_output` and `sentence_output` to compute `loss`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "k30H4Q86f52x" - }, - "outputs": [], - "source": [ - "masked_lm_ids_data = np.random.randint(vocab_size, size=(batch_size, num_token_predictions))\n", - "masked_lm_weights_data = np.random.randint(2, size=(batch_size, num_token_predictions))\n", - "next_sentence_labels_data = np.random.randint(2, size=(batch_size))\n", - "\n", - "mlm_loss = modeling.losses.weighted_sparse_categorical_crossentropy_loss(\n", - " labels=masked_lm_ids_data,\n", - " predictions=lm_output,\n", - " weights=masked_lm_weights_data)\n", - "sentence_loss = modeling.losses.weighted_sparse_categorical_crossentropy_loss(\n", - " labels=next_sentence_labels_data,\n", - " predictions=sentence_output)\n", - "loss = mlm_loss + sentence_loss\n", - "print(loss)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wrmSs8GjHxVw" - }, - "source": [ - "With the loss, you can optimize the model.\n", - "After training, we can save the weights of the `TransformerEncoder` for downstream fine-tuning tasks. Please see [run_pretraining.py](https://github.com/tensorflow/models/blob/master/official/nlp/bert/run_pretraining.py) for the full example.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "k8cQVFvBCV4s" - }, - "source": [ - "## Span labeling model\n", - "\n", - "Span labeling is the task of assigning labels to a span of text, for example, labeling a span of text as the answer to a given question.\n", - "\n", - "In this section, we will learn how to build a span labeling model. Again, we use dummy data for simplicity."
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xrLLEWpfknUW" - }, - "source": [ - "### Build a BertSpanLabeler wrapping BertEncoder\n", - "\n", - "[BertSpanLabeler](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_span_labeler.py) implements a simple single-span start-end predictor (that is, a model that predicts two values: a start token index and an end token index), suitable for SQuAD-style tasks.\n", - "\n", - "Note that `BertSpanLabeler` wraps a `BertEncoder`, the weights of which can be restored from the above pretraining model.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "B941M4iUCejO" - }, - "outputs": [], - "source": [ - "network = modeling.networks.BertEncoder(\n", - " vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)\n", - "\n", - "# Create a BERT trainer with the created network.\n", - "bert_span_labeler = modeling.models.BertSpanLabeler(network)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QpB9pgj4PpMg" - }, - "source": [ - "Inspecting the `bert_span_labeler`, we see it wraps the encoder with an additional `SpanLabeling` head that outputs `start_position` and `end_position`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "RbqRNJCLJu4H" - }, - "outputs": [], - "source": [ - "tf.keras.utils.plot_model(bert_span_labeler, show_shapes=True, dpi=48)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fUf1vRxZJwio" - }, - "outputs": [], - "source": [ - "# Create a set of 2-dimensional data tensors to feed into the model.\n", - "word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length))\n", - "mask_data = np.random.randint(2, size=(batch_size, sequence_length))\n", - "type_id_data = np.random.randint(2, size=(batch_size, sequence_length))\n", - "\n", - "# Feed the data to the model.\n", - "start_logits, end_logits = bert_span_labeler([word_id_data, mask_data, type_id_data])\n", - "print(start_logits)\n", - "print(end_logits)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WqhgQaN1lt-G" - }, - "source": [ - "### Compute loss\n", - "With `start_logits` and `end_logits`, we can compute loss:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "waqs6azNl3Nn" - }, - "outputs": [], - "source": [ - "start_positions = np.random.randint(sequence_length, size=(batch_size))\n", - "end_positions = np.random.randint(sequence_length, size=(batch_size))\n", - "\n", - "start_loss = tf.keras.losses.sparse_categorical_crossentropy(\n", - " start_positions, start_logits, from_logits=True)\n", - "end_loss = tf.keras.losses.sparse_categorical_crossentropy(\n", - " end_positions, end_logits, from_logits=True)\n", - "\n", - "total_loss = (tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)) / 2\n", - "print(total_loss)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Zdf03YtZmd_d" - }, - "source": [ - "With the `loss`, you can optimize the model. Please see [run_squad.py](https://github.com/tensorflow/models/blob/master/official/nlp/bert/run_squad.py) for the full example."
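As a follow-up not shown in the library example, here is one hedged sketch of turning the logits into an actual span prediction by taking the argmax of each head per example:

```python
# Illustrative follow-up: decode a predicted (start, end) token span from the
# logits produced above by taking the argmax of each head.
predicted_starts = tf.argmax(start_logits, axis=-1).numpy()
predicted_ends = tf.argmax(end_logits, axis=-1).numpy()
print(list(zip(predicted_starts, predicted_ends)))
# Note: a production decoder (like the one in run_squad.py) additionally
# enforces start <= end and scores the top-k start/end combinations.
```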
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0A1XnGSTChg9" - }, - "source": [ - "## Classification model\n", - "\n", - "In this final section, we show how to build a text classification model.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MSK8OpZgnQa9" - }, - "source": [ - "### Build a BertClassifier model wrapping BertEncoder\n", - "\n", - "[BertClassifier](https://github.com/tensorflow/models/blob/master/official/nlp/modeling/models/bert_classifier.py) implements a [CLS] token classification model containing a single classification head." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cXXCsffkCphk" - }, - "outputs": [], - "source": [ - "network = modeling.networks.BertEncoder(\n", - " vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)\n", - "\n", - "# Create a BERT trainer with the created network.\n", - "num_classes = 2\n", - "bert_classifier = modeling.models.BertClassifier(\n", - " network, num_classes=num_classes)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8tZKueKYP4bB" - }, - "source": [ - "Inspecting the `bert_classifier`, we see it wraps the `encoder` with an additional `Classification` head." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "snlutm9ZJgEZ" - }, - "outputs": [], - "source": [ - "tf.keras.utils.plot_model(bert_classifier, show_shapes=True, dpi=48)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "yyHPHsqBJkCz" - }, - "outputs": [], - "source": [ - "# Create a set of 2-dimensional data tensors to feed into the model.\n", - "word_id_data = np.random.randint(vocab_size, size=(batch_size, sequence_length))\n", - "mask_data = np.random.randint(2, size=(batch_size, sequence_length))\n", - "type_id_data = np.random.randint(2, size=(batch_size, sequence_length))\n", - "\n", - "# Feed the data to the model.\n", - "logits = bert_classifier([word_id_data, mask_data, type_id_data])\n", - "print(logits)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "w--a2mg4nzKm" - }, - "source": [ - "### Compute loss\n", - "\n", - "With `logits`, we can compute `loss`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9X0S1DoFn_5Q" - }, - "outputs": [], - "source": [ - "labels = np.random.randint(num_classes, size=(batch_size))\n", - "\n", - "loss = tf.keras.losses.sparse_categorical_crossentropy(\n", - " labels, logits, from_logits=True)\n", - "print(loss)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mzBqOylZo3og" - }, - "source": [ - "With the `loss`, you can optimize the model. Please see [run_classifier.py](https://github.com/tensorflow/models/blob/master/official/nlp/bert/run_classifier.py) or the colab [fine_tuning_bert.ipynb](https://github.com/tensorflow/models/blob/master/official/colab/fine_tuning_bert.ipynb) for the full example."
- ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "name": "Introduction to the TensorFlow Models NLP library", - "private_outputs": true, - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/common/__init__.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/common/__init__.py deleted file mode 100644 index 3ef7bb85ba5f722a4f34e90623470d5a45af3aa4..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/common/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/common/dataset_fn.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/common/dataset_fn.py deleted file mode 100644 index fb6a5b42d034e8fdb1a2c2027def08cf65f35afe..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/common/dataset_fn.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================== -"""Utility library for picking an appropriate dataset function.""" - -from typing import Any, Callable, Union, Type - -import tensorflow as tf - -PossibleDatasetType = Union[Type[tf.data.Dataset], Callable[[tf.Tensor], Any]] - - -def pick_dataset_fn(file_type: str) -> PossibleDatasetType: - if file_type == 'tfrecord': - return tf.data.TFRecordDataset - - raise ValueError('Unrecognized file_type: {}'.format(file_type)) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/common/distribute_utils.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/common/distribute_utils.py deleted file mode 100644 index c484e0bfa2704481db8dab695bd4d2426c1ebbce..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/common/distribute_utils.py +++ /dev/null @@ -1,246 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""Helper functions for running models in a distributed setting.""" - -import json -import os -import tensorflow as tf - - -def _collective_communication(all_reduce_alg): - """Return a CollectiveCommunication based on all_reduce_alg. - - Args: - all_reduce_alg: a string specifying which collective communication to pick, - or None. - - Returns: - tf.distribute.experimental.CollectiveCommunication object - - Raises: - ValueError: if `all_reduce_alg` not in [None, "ring", "nccl"] - """ - collective_communication_options = { - None: tf.distribute.experimental.CollectiveCommunication.AUTO, - "ring": tf.distribute.experimental.CollectiveCommunication.RING, - "nccl": tf.distribute.experimental.CollectiveCommunication.NCCL - } - if all_reduce_alg not in collective_communication_options: - raise ValueError( - "When used with `multi_worker_mirrored`, valid values for " - "all_reduce_alg are [`ring`, `nccl`]. Supplied value: {}".format( - all_reduce_alg)) - return collective_communication_options[all_reduce_alg] - - -def _mirrored_cross_device_ops(all_reduce_alg, num_packs): - """Return a CrossDeviceOps based on all_reduce_alg and num_packs. - - Args: - all_reduce_alg: a string specifying which cross device op to pick, or None. - num_packs: an integer specifying number of packs for the cross device op. - - Returns: - tf.distribute.CrossDeviceOps object or None. - - Raises: - ValueError: if `all_reduce_alg` not in [None, "nccl", "hierarchical_copy"]. - """ - if all_reduce_alg is None: - return None - mirrored_all_reduce_options = { - "nccl": tf.distribute.NcclAllReduce, - "hierarchical_copy": tf.distribute.HierarchicalCopyAllReduce - } - if all_reduce_alg not in mirrored_all_reduce_options: - raise ValueError( - "When used with `mirrored`, valid values for all_reduce_alg are " - "[`nccl`, `hierarchical_copy`]. Supplied value: {}".format( - all_reduce_alg)) - cross_device_ops_class = mirrored_all_reduce_options[all_reduce_alg] - return cross_device_ops_class(num_packs=num_packs) - - -def tpu_initialize(tpu_address): - """Initializes TPU for TF 2.x training. - - Args: - tpu_address: string, bns address of master TPU worker. - - Returns: - A TPUClusterResolver. - """ - cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver( - tpu=tpu_address) - if tpu_address not in ("", "local"): - tf.config.experimental_connect_to_cluster(cluster_resolver) - tf.tpu.experimental.initialize_tpu_system(cluster_resolver) - return cluster_resolver - - -def get_distribution_strategy(distribution_strategy="mirrored", - num_gpus=0, - all_reduce_alg=None, - num_packs=1, - tpu_address=None, - **kwargs): - """Return a DistributionStrategy for running the model. - - Args: - distribution_strategy: a string specifying which distribution strategy to - use. Accepted values are "off", "one_device", "mirrored", - "parameter_server", "multi_worker_mirrored", and "tpu" -- case - insensitive. "off" means not to use Distribution Strategy; "tpu" means to - use TPUStrategy using `tpu_address`. - num_gpus: Number of GPUs to run this model. - all_reduce_alg: Optional. Specifies which algorithm to use when performing - all-reduce. For `MirroredStrategy`, valid values are "nccl" and - "hierarchical_copy". For `MultiWorkerMirroredStrategy`, valid values are - "ring" and "nccl". If None, DistributionStrategy will choose based on - device topology. - num_packs: Optional. Sets the `num_packs` in `tf.distribute.NcclAllReduce` - or `tf.distribute.HierarchicalCopyAllReduce` for `MirroredStrategy`. - tpu_address: Optional. 
String that represents TPU to connect to. Must not be - None if `distribution_strategy` is set to `tpu`. - **kwargs: Additional kwargs for internal usages. - - Returns: - tf.distribute.DistributionStrategy object. - Raises: - ValueError: if `distribution_strategy` is "off" or "one_device" and - `num_gpus` is larger than 1; if `num_gpus` is negative; or if - `distribution_strategy` is `tpu` but `tpu_address` is not specified. - """ - del kwargs - if num_gpus < 0: - raise ValueError("`num_gpus` cannot be negative.") - - if not isinstance(distribution_strategy, str): - msg = ("distribution_strategy must be a string but got: %s." % - (distribution_strategy,)) - if distribution_strategy == False: # pylint: disable=singleton-comparison,g-explicit-bool-comparison - msg += (" If you meant to pass the string 'off', make sure you add " - "quotes around 'off' so that yaml interprets it as a string " - "instead of a bool.") - raise ValueError(msg) - - distribution_strategy = distribution_strategy.lower() - if distribution_strategy == "off": - if num_gpus > 1: - raise ValueError("When {} GPUs are specified, distribution_strategy " - "flag cannot be set to `off`.".format(num_gpus)) - return None - - if distribution_strategy == "tpu": - # When tpu_address is an empty string, we communicate with local TPUs. - cluster_resolver = tpu_initialize(tpu_address) - return tf.distribute.TPUStrategy(cluster_resolver) - - if distribution_strategy == "multi_worker_mirrored": - return tf.distribute.experimental.MultiWorkerMirroredStrategy( - communication=_collective_communication(all_reduce_alg)) - - if distribution_strategy == "one_device": - if num_gpus == 0: - return tf.distribute.OneDeviceStrategy("device:CPU:0") - if num_gpus > 1: - raise ValueError("`OneDeviceStrategy` cannot be used for more than " - "one device.") - return tf.distribute.OneDeviceStrategy("device:GPU:0") - - if distribution_strategy == "mirrored": - if num_gpus == 0: - devices = ["device:CPU:0"] - else: - devices = ["device:GPU:%d" % i for i in range(num_gpus)] - return tf.distribute.MirroredStrategy( - devices=devices, - cross_device_ops=_mirrored_cross_device_ops(all_reduce_alg, num_packs)) - - if distribution_strategy == "parameter_server": - cluster_resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver() - return tf.distribute.experimental.ParameterServerStrategy(cluster_resolver) - - raise ValueError("Unrecognized Distribution Strategy: %r" % - distribution_strategy) - - def configure_cluster(worker_hosts=None, task_index=-1): - """Sets the multi-worker cluster spec in the TF_CONFIG environment variable. - - Args: - worker_hosts: comma-separated list of worker ip:port pairs. - task_index: index of the worker. - - Returns: - Number of workers in the cluster.
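-
-  Example: `worker_hosts="host1:port1,host2:port2"` with `task_index=0`
-  marks this process as the first of two workers (the host:port values
-  here are placeholders).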
- """ - tf_config = json.loads(os.environ.get("TF_CONFIG", "{}")) - if tf_config: - num_workers = ( - len(tf_config["cluster"].get("chief", [])) + - len(tf_config["cluster"].get("worker", []))) - elif worker_hosts: - workers = worker_hosts.split(",") - num_workers = len(workers) - if num_workers > 1 and task_index < 0: - raise ValueError("Must specify task_index when number of workers > 1") - task_index = 0 if num_workers == 1 else task_index - os.environ["TF_CONFIG"] = json.dumps({ - "cluster": { - "worker": workers - }, - "task": { - "type": "worker", - "index": task_index - } - }) - else: - num_workers = 1 - return num_workers - - -def get_strategy_scope(strategy): - if strategy: - strategy_scope = strategy.scope() - else: - strategy_scope = DummyContextManager() - - return strategy_scope - - -class DummyContextManager(object): - - def __enter__(self): - pass - - def __exit__(self, *args): - pass diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/common/distribute_utils_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/common/distribute_utils_test.py deleted file mode 100644 index a8c3bfc1775d2c895eed0837dcfa36440b803ee2..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/common/distribute_utils_test.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""Tests for distribution util functions.""" - -import tensorflow as tf - -from official.common import distribute_utils - - -class GetDistributionStrategyTest(tf.test.TestCase): - """Tests for get_distribution_strategy.""" - - def test_one_device_strategy_cpu(self): - ds = distribute_utils.get_distribution_strategy(num_gpus=0) - self.assertEqual(ds.num_replicas_in_sync, 1) - self.assertEqual(len(ds.extended.worker_devices), 1) - self.assertIn('CPU', ds.extended.worker_devices[0]) - - def test_one_device_strategy_gpu(self): - ds = distribute_utils.get_distribution_strategy(num_gpus=1) - self.assertEqual(ds.num_replicas_in_sync, 1) - self.assertEqual(len(ds.extended.worker_devices), 1) - self.assertIn('GPU', ds.extended.worker_devices[0]) - - def test_mirrored_strategy(self): - ds = distribute_utils.get_distribution_strategy(num_gpus=5) - self.assertEqual(ds.num_replicas_in_sync, 5) - self.assertEqual(len(ds.extended.worker_devices), 5) - for device in ds.extended.worker_devices: - self.assertIn('GPU', device) - - def test_no_strategy(self): - ds = distribute_utils.get_distribution_strategy('off') - self.assertIsNone(ds) - - def test_invalid_strategy(self): - with self.assertRaisesRegex( - ValueError, - 'distribution_strategy must be a string but got: False. If'): - distribute_utils.get_distribution_strategy(False) - with self.assertRaisesRegex( - ValueError, 'distribution_strategy must be a string but got: 1'): - distribute_utils.get_distribution_strategy(1) - - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/common/flags.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/common/flags.py deleted file mode 100644 index d7216ee0946b7275c3515630871b188d1d464adb..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/common/flags.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
-# - -"""The central place to define flags.""" - -from absl import flags - - -def define_flags(): - """Defines flags.""" - flags.DEFINE_string( - 'experiment', default=None, help='The experiment type registered.') - - flags.DEFINE_enum( - 'mode', - default=None, - enum_values=[ - 'train', 'eval', 'train_and_eval', 'continuous_eval', - 'continuous_train_and_eval', 'train_and_validate' - ], - help='Mode to run: `train`, `eval`, `train_and_eval`, ' - '`continuous_eval`, `continuous_train_and_eval` and ' - '`train_and_validate` (which is not implemented in ' - 'the open source version).') - - flags.DEFINE_string( - 'model_dir', - default=None, - help='The directory where the model and training/evaluation summaries ' - 'are stored.') - - flags.DEFINE_multi_string( - 'config_file', - default=None, - help='YAML/JSON files which specify overrides. The override order ' - 'follows the order of args. Note that each file ' - 'can be used as an override template to override the default parameters ' - 'specified in Python. If the same parameter is specified in both ' - '`--config_file` and `--params_override`, `config_file` will be used ' - 'first, followed by params_override.') - - flags.DEFINE_string( - 'params_override', - default=None, - help='A YAML/JSON string or a YAML file which specifies additional ' - 'overrides over the default parameters and those specified in ' - '`--config_file`. Note that this is supposed to be used only to override ' - 'the model parameters, but not the parameters like TPU specific flags. ' - 'One canonical use case of `--config_file` and `--params_override` is that ' - 'users first define a template config file using `--config_file`, then ' - 'use `--params_override` to adjust the minimal set of tuning parameters, ' - 'for example setting up different `train_batch_size`. The final override ' - 'order of parameters: default_model_params --> params from config_file ' - '--> params in params_override. See also the help message of ' - '`--config_file`.') - - # Libraries that rely on gin often make the mistake of defining flags inside - # library files, which causes conflicts. - try: - flags.DEFINE_multi_string( - 'gin_file', default=None, help='List of paths to the config files.') - except flags.DuplicateFlagError: - pass - - try: - flags.DEFINE_multi_string( - 'gin_params', - default=None, - help='Newline separated list of Gin parameter bindings.') - except flags.DuplicateFlagError: - pass - - flags.DEFINE_string( - 'tpu', - default=None, - help='The Cloud TPU to use for training. This should be either the name ' - 'used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 ' - 'url.') - - flags.DEFINE_string( - 'tf_data_service', default=None, help='The tf.data service address.') diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/common/registry_imports.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/common/registry_imports.py deleted file mode 100644 index 653bedaac206c04d1856c43a7bbbc7ba4bfb8534..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/common/registry_imports.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""All necessary imports for registration.""" -# pylint: disable=unused-import -from official.nlp import tasks -from official.nlp.configs import experiment_configs -from official.utils.testing import mock_task -from official.vision import beta diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/__init__.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/__init__.py deleted file mode 100644 index a11b1ff79e891e0fcee5bf824718e75d9103e28f..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/base_task.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/base_task.py deleted file mode 100644 index 95558edf8696bee9a84c4f3c8339bc78dd8f30e3..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/base_task.py +++ /dev/null @@ -1,320 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Defines the base task abstraction.""" -import abc -from typing import Optional - -from absl import logging -import tensorflow as tf - -from official.core import config_definitions -from official.modeling import optimization -from official.modeling import performance - -OptimizationConfig = optimization.OptimizationConfig -RuntimeConfig = config_definitions.RuntimeConfig - - -class Task(tf.Module, metaclass=abc.ABCMeta): - """A single-replica view of the training procedure. - - Tasks provide artifacts for training/validation procedures, including - loading/iterating over Datasets, training/validation steps, calculating the - loss and customized metrics with reduction. - """ - - # Special keys in train/validate step returned logs. - loss = "loss" - - def __init__(self, params, logging_dir: str = None, name: str = None): - """Task initialization. - - Args: - params: the task configuration instance, which can be any of dataclass, - ConfigDict, namedtuple, etc. - logging_dir: a string pointing to where the model, summaries etc. will be - saved. You can also write additional files to this directory. - name: the task name. - """ - super().__init__(name=name) - self._task_config = params - self._logging_dir = logging_dir - - @property - def task_config(self): - return self._task_config - - @property - def logging_dir(self) -> str: - return self._logging_dir - - @classmethod - def create_optimizer(cls, optimizer_config: OptimizationConfig, - runtime_config: Optional[RuntimeConfig] = None): - """Creates a TF optimizer from configurations. - - Args: - optimizer_config: the parameters of the Optimization settings. - runtime_config: the parameters of the runtime. - - Returns: - A tf.optimizers.Optimizer object. - """ - opt_factory = optimization.OptimizerFactory(optimizer_config) - optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate()) - # Configuring optimizer when loss_scale is set in runtime config. This helps - # avoid overflow/underflow for float16 computations.
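- # (Loss scaling multiplies the loss by a constant factor so that float16
- # gradients stay above the smallest positive float16 value, roughly 6e-8;
- # the LossScaleOptimizer divides the gradients by the same factor before
- # they are applied.)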
- if runtime_config and runtime_config.loss_scale: - optimizer = performance.configure_optimizer( - optimizer, - use_float16=runtime_config.mixed_precision_dtype == "float16", - loss_scale=runtime_config.loss_scale) - - return optimizer - - def initialize(self, model: tf.keras.Model): - """[Optional] A callback function used as CheckpointManager's init_fn. - - This function will be called when no checkpoint is found for the model. - If there is a checkpoint, the checkpoint will be loaded and this function - will not be called. You can use this callback function to load a pretrained - checkpoint, saved under a directory other than the model_dir. - - Args: - model: The keras.Model built or used by this task. - """ - ckpt_dir_or_file = self.task_config.init_checkpoint - logging.info("Trying to load pretrained checkpoint from %s", - ckpt_dir_or_file) - if tf.io.gfile.isdir(ckpt_dir_or_file): - ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file) - if not ckpt_dir_or_file: - return - - if hasattr(model, "checkpoint_items"): - checkpoint_items = model.checkpoint_items - else: - checkpoint_items = dict(model=model) - ckpt = tf.train.Checkpoint(**checkpoint_items) - status = ckpt.read(ckpt_dir_or_file) - status.expect_partial().assert_existing_objects_matched() - logging.info("Finished loading pretrained checkpoint from %s", - ckpt_dir_or_file) - - def build_model(self) -> tf.keras.Model: - """[Optional] Creates the model architecture. - - Returns: - A model instance. - """ - - @abc.abstractmethod - def build_inputs(self, - params, - input_context: Optional[tf.distribute.InputContext] = None): - """Returns a dataset or a nested structure of dataset functions. - - Dataset functions define per-host datasets with the per-replica batch size. - With distributed training, this method runs on remote hosts. - - Args: - params: hyperparams to create input pipelines, which can be any of - dataclass, ConfigDict, namedtuple, etc. - input_context: optional distribution input pipeline context. - - Returns: - A nested structure of per-replica input functions. - """ - - def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor: - """Standard interface to compute losses. - - Args: - labels: optional label tensors. - model_outputs: a nested structure of output tensors. - aux_losses: auxiliary loss tensors, i.e. `losses` in keras.Model. - - Returns: - The total loss tensor. - """ - del model_outputs, labels - - if aux_losses is None: - losses = [tf.constant(0.0, dtype=tf.float32)] - else: - losses = aux_losses - total_loss = tf.add_n(losses) - return total_loss - - def build_metrics(self, training: bool = True): - """Gets streaming metrics for training/validation.""" - del training - return [] - - def process_metrics(self, metrics, labels, model_outputs): - """Process and update metrics. - - Called when using the custom training loop API. - - Args: - metrics: a nested structure of metrics objects. The return of function - self.build_metrics. - labels: a tensor or a nested structure of tensors. - model_outputs: a tensor or a nested structure of tensors. For example, - output of the keras model built by self.build_model. - """ - for metric in metrics: - metric.update_state(labels, model_outputs) - - def process_compiled_metrics(self, compiled_metrics, labels, model_outputs): - """Process and update compiled_metrics. - - Called when using the compile/fit API. - - Args: - compiled_metrics: the compiled metrics (model.compiled_metrics). - labels: a tensor or a nested structure of tensors.
- model_outputs: a tensor or a nested structure of tensors. For example, - output of the keras model built by self.build_model. - """ - compiled_metrics.update_state(labels, model_outputs) - - def train_step(self, - inputs, - model: tf.keras.Model, - optimizer: tf.keras.optimizers.Optimizer, - metrics=None): - """Does forward and backward. - - With distribution strategies, this method runs on devices. - - Args: - inputs: a dictionary of input tensors. - model: the model, forward pass definition. - optimizer: the optimizer for this training step. - metrics: a nested structure of metrics objects. - - Returns: - A dictionary of logs. - """ - if isinstance(inputs, tuple) and len(inputs) == 2: - features, labels = inputs - else: - features, labels = inputs, inputs - with tf.GradientTape() as tape: - outputs = model(features, training=True) - # Computes per-replica loss. - if model.compiled_loss: - loss = model.compiled_loss( - labels, outputs, regularization_losses=model.losses) - loss += self.build_losses( - labels=labels, model_outputs=outputs, aux_losses=None) - else: - loss = self.build_losses( - labels=labels, model_outputs=outputs, aux_losses=model.losses) - # Scales loss as the default gradients allreduce performs sum inside the - # optimizer. - scaled_loss = loss / tf.distribute.get_strategy().num_replicas_in_sync - - # For mixed precision, when a LossScaleOptimizer is used, the loss is - # scaled to avoid numeric underflow. - if isinstance(optimizer, - tf.keras.mixed_precision.LossScaleOptimizer): - scaled_loss = optimizer.get_scaled_loss(scaled_loss) - - tvars = model.trainable_variables - grads = tape.gradient(scaled_loss, tvars) - - if isinstance(optimizer, - tf.keras.mixed_precision.LossScaleOptimizer): - grads = optimizer.get_unscaled_gradients(grads) - optimizer.apply_gradients(list(zip(grads, tvars))) - logs = {self.loss: loss} - if metrics: - self.process_metrics(metrics, labels, outputs) - if model.compiled_metrics: - self.process_compiled_metrics(model.compiled_metrics, labels, outputs) - logs.update({m.name: m.result() for m in metrics or []}) - logs.update({m.name: m.result() for m in model.metrics}) - return logs - - def validation_step(self, inputs, model: tf.keras.Model, metrics=None): - """Validation step. - - With distribution strategies, this method runs on devices. - - Args: - inputs: a dictionary of input tensors. - model: the keras.Model. - metrics: a nested structure of metrics objects. - - Returns: - A dictionary of logs. - """ - if isinstance(inputs, tuple) and len(inputs) == 2: - features, labels = inputs - else: - features, labels = inputs, inputs - outputs = self.inference_step(features, model) - loss = self.build_losses( - labels=labels, model_outputs=outputs, aux_losses=model.losses) - logs = {self.loss: loss} - if metrics: - self.process_metrics(metrics, labels, outputs) - if model.compiled_metrics: - self.process_compiled_metrics(model.compiled_metrics, labels, outputs) - logs.update({m.name: m.result() for m in metrics or []}) - logs.update({m.name: m.result() for m in model.metrics}) - return logs - - def inference_step(self, inputs, model: tf.keras.Model): - """Performs the forward step. - - With distribution strategies, this method runs on devices. - - Args: - inputs: a dictionary of input tensors. - model: the keras.Model. - - Returns: - Model outputs. 
- """ - return model(inputs, training=False) - - def aggregate_logs(self, state, step_logs): - """Optional aggregation over logs returned from a validation step.""" - pass - - def reduce_aggregated_logs(self, - aggregated_logs, - global_step: Optional[tf.Tensor] = None): - """Optional reduce of aggregated logs over validation steps.""" - return {} diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/base_trainer.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/base_trainer.py deleted file mode 100644 index eb089f6099514f7a22cfab409f76609caa27fe1d..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/base_trainer.py +++ /dev/null @@ -1,496 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Standard Trainer implementation. - -The base trainer implements the Orbit `StandardTrainable` and -`StandardEvaluable` interfaces. Trainers inside this project should be -interchangable and independent on model architectures and tasks. -""" -import functools -from typing import Union, Optional -from absl import logging -import gin -import orbit -import tensorflow as tf - -from official.core import base_task -from official.core import config_definitions -from official.modeling import optimization - -ExperimentConfig = config_definitions.ExperimentConfig -TrainerConfig = config_definitions.TrainerConfig - - -class Recovery: - """Built-in model blowup recovery module. - - Checks the loss value by the given threshold. If applicable, recover the - model by reading the checkpoint on disk. 
- """ - - def __init__(self, - loss_upper_bound: float, - checkpoint_manager: tf.train.CheckpointManager, - recovery_begin_steps: int = 0, - recovery_max_trials: int = 3): - self.recover_counter = 0 - self.recovery_begin_steps = recovery_begin_steps - self.recovery_max_trials = recovery_max_trials - self.loss_upper_bound = loss_upper_bound - self.checkpoint_manager = checkpoint_manager - - def should_recover(self, loss_value, global_step): - if tf.math.is_nan(loss_value): - return True - if (global_step >= self.recovery_begin_steps and - loss_value > self.loss_upper_bound): - return True - return False - - def maybe_recover(self, loss_value, global_step): - """Conditionally recovers the training by triggering checkpoint restoration. - - Args: - loss_value: the loss value as a float. - global_step: the number of global training steps. - - Raises: - RuntimeError: when recovery happens more than the max number of trials, - the job should crash. - """ - if not self.should_recover(loss_value, global_step): - return - self.recover_counter += 1 - if self.recover_counter > self.recovery_max_trials: - raise RuntimeError( - "The loss value is NaN after training loop and it happens %d times." % - self.recover_counter) - # Loads the previous good checkpoint. - checkpoint_path = self.checkpoint_manager.restore_or_initialize() - logging.warning( - "Recovering the model from checkpoint: %s. The loss value becomes " - "%f at step %d.", checkpoint_path, loss_value, global_step) - - -class _AsyncTrainer(orbit.StandardTrainer, orbit.StandardEvaluator): - """Trainer class for both sync and async Strategy.""" - - def init_async(self): - """Initializes the Async Trainer base class.""" - assert isinstance(self._strategy, tf.distribute.Strategy) - self._is_async = isinstance( - self._strategy, tf.distribute.experimental.ParameterServerStrategy) - self._coordinator = None - if self._is_async: - self._coordinator = ( - tf.distribute.experimental.coordinator.ClusterCoordinator( - self._strategy)) - - def join(self): - """Join all async steps. Only useful in aysnc training.""" - if getattr(self, "_is_async", False): - self._coordinator.join() - - def create_train_loop_fn(self): - """Creates a eval loop from the given step function and options.""" - train_loop_fn = super().create_train_loop_fn() - if getattr(self, "_is_async", False): - - def _async_loop_fn(iterator, num_steps): - self._coordinator.schedule(train_loop_fn, args=(iterator, num_steps)) - - return _async_loop_fn - else: - return train_loop_fn - - def create_eval_loop_fn(self, has_state: bool): - """Creates a training loop from the given step function and options.""" - eval_loop_fn = super().create_eval_loop_fn(has_state) - - if getattr(self, "_is_async", False): - if has_state: - raise ValueError( - "Stateful eval loop is not supported in async training.") - - def _async_loop_fn(iterator, num_steps, state=None, reduce_fn=None): - assert state is None - assert reduce_fn is None - self._coordinator.schedule(eval_loop_fn, args=(iterator, num_steps)) - - return _async_loop_fn - else: - return eval_loop_fn - - def distribute_dataset(self, dataset_or_fn, *args, **kwargs): - """A utility function to help create a `tf.distribute.DistributedDataset`. - - Args: - dataset_or_fn: A instance of `tf.data.Dataset`, or a "dataset function" - returning a `tf.data.Dataset`. If it is a function, it may optionally - have an argument named `input_context` which will be passed a - `tf.distribute.InputContext` instance. 
- *args: Any positional arguments to pass through to `dataset_or_fn`. - **kwargs: Any keyword arguments to pass through to `dataset_or_fn`. - - Returns: - A distributed Dataset. - """ - if getattr(self, "_is_async", False): - per_worker_dataset_fn = functools.partial( - orbit.utils.make_distributed_dataset, self._strategy, dataset_or_fn, - *args, **kwargs) - per_worker_dataset_fn = tf.function(per_worker_dataset_fn) - - return self._coordinator.create_per_worker_dataset(per_worker_dataset_fn) - else: - return orbit.utils.make_distributed_dataset(self._strategy, dataset_or_fn, - *args, **kwargs) - - -def get_runtime_options(config: ExperimentConfig): - """Gets tf.distribute.RunOptions from config.""" - xla_options = {} - if config.runtime.tpu_enable_xla_dynamic_padder is not None: - xla_options["enable_xla_dynamic_padder"] = ( - config.runtime.tpu_enable_xla_dynamic_padder) - return tf.distribute.RunOptions( - experimental_xla_options=tf.tpu.XLAOptions(**xla_options)) - - -@gin.configurable - class Trainer(_AsyncTrainer): - """Implements the common trainer shared for TensorFlow models.""" - - # pylint: disable=super-init-not-called - def __init__( - self, - config: ExperimentConfig, - task: base_task.Task, - model: tf.keras.Model, - optimizer: tf.optimizers.Optimizer, - train: bool = True, - evaluate: bool = True, - train_dataset: Optional[Union[tf.data.Dataset, - tf.distribute.DistributedDataset]] = None, - validation_dataset: Optional[Union[ - tf.data.Dataset, tf.distribute.DistributedDataset]] = None, - checkpoint_exporter=None): - """Initializes the common trainer for TensorFlow models. - - Args: - config: An `ExperimentConfig` instance specifying experiment config. - task: A base_task.Task instance. - model: The model instance, e.g. a tf.keras.Model instance. - optimizer: tf.optimizers.Optimizer instance. - train: bool, whether or not this trainer will be used for training. - Defaults to True. - evaluate: bool, whether or not this trainer will be used for evaluation. - Defaults to True. - train_dataset: a dataset object created for training. With tf.distribute, - it needs to be a `DistributedDataset`. - validation_dataset: a dataset object created for evaluation. With - tf.distribute, it needs to be a `DistributedDataset`. The evaluator will - create a dataset iterator for each eval round, so the dataset does not - need to repeat. - checkpoint_exporter: an object that has the `maybe_export_checkpoint` - interface. - """ - # Gets the current distribution strategy. If not inside any strategy scope, - # it gets a single-replica no-op strategy. - self._strategy = tf.distribute.get_strategy() - self._validate_params( - config, - check_train_data=train_dataset is None, - check_validation_data=validation_dataset is None) - self._config = config - self._task = task - self._model = model - self._optimizer = optimizer - self._checkpoint_exporter = checkpoint_exporter - self._recovery = None - # Runtime options are only applied to train_step. - # We use the default for eval_step. - self._runtime_options = get_runtime_options(config) - - # Creates a shadow copy of the weights to store the moving average of the - # weights. - if isinstance(self._optimizer, optimization.ExponentialMovingAverage - ) and not self._optimizer.has_shadow_copy: - self._optimizer.shadow_copy(self._model) - - # global_step increases by 1 after each training iteration. - # We should have global_step.numpy() == self.optimizer.iterations.numpy() - # when there is only 1 optimizer.
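- # (When a model is trained with more than one optimizer, each optimizer
- # keeps its own `iterations` counter, so this explicit global step is the
- # single source of truth for step accounting.)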
- self._global_step = orbit.utils.create_global_step() - if hasattr(self.model, "checkpoint_items"): - checkpoint_items = self.model.checkpoint_items - else: - checkpoint_items = {} - self._checkpoint = tf.train.Checkpoint( - global_step=self.global_step, - model=self.model, - optimizer=self.optimizer, - **checkpoint_items) - - self._train_loss = tf.keras.metrics.Mean("training_loss", dtype=tf.float32) - self._validation_loss = tf.keras.metrics.Mean( - "validation_loss", dtype=tf.float32) - self._train_metrics = self.task.build_metrics( - training=True) + self.model.metrics - self._validation_metrics = self.task.build_metrics( - training=False) + self.model.metrics - - self.init_async() - - if train: - train_dataset = train_dataset or self.distribute_dataset( - self.task.build_inputs, self.config.task.train_data) - orbit.StandardTrainer.__init__( - self, - train_dataset, - options=orbit.StandardTrainerOptions( - use_tf_while_loop=config.trainer.train_tf_while_loop, - use_tf_function=config.trainer.train_tf_function, - use_tpu_summary_optimization=config.trainer.allow_tpu_summary)) - - if evaluate: - validation_dataset = validation_dataset or self.distribute_dataset( - self.task.build_inputs, self.config.task.validation_data) - orbit.StandardEvaluator.__init__( - self, - validation_dataset, - options=orbit.StandardEvaluatorOptions( - use_tf_function=config.trainer.eval_tf_function, - use_tf_while_loop=config.trainer.eval_tf_while_loop)) - - def _validate_params(self, - config, - check_train_data=True, - check_validation_data=True): - r"""Validates the configuration object passed to the Trainer. - - The experiment configuration should be structured as: - \trainer - \task - \train_data - \validation_data - - Args: - config: a namedtuple, dataclass, ConfigDict, etc. - check_train_data: whether to check task.train_data field. - check_validation_data: whether to check task.validation_data field. - """ - if not hasattr(config, "trainer"): - raise AttributeError("The trainer requires that the configuration " - "contains an attribute `trainer`.") - - if not hasattr(config, "task"): - raise AttributeError("The trainer requires that the configuration " - "contains an attribute `task`.") - - if check_train_data and not hasattr(config.task, "train_data"): - raise AttributeError("The trainer requires that the configuration " - "contains an attribute `task.train_data`.") - - if check_validation_data and not hasattr(config.task, "validation_data"): - raise AttributeError("The trainer requires that the configuration " - "contains an attribute `task.validation_data`.") - - @property - def strategy(self): - return self._strategy - - @property - def config(self): - return self._config - - @property - def task(self): - return self._task - - @property - def model(self): - return self._model - - @property - def optimizer(self): - if hasattr(self, "_optimizer"): - return self._optimizer - else: - return None - - @property - def global_step(self): - return self._global_step - - @property - def train_loss(self): - """Accesses the training loss metric object.""" - return self._train_loss - - @property - def validation_loss(self): - """Accesses the validation loss metric object.""" - return self._validation_loss - - @property - def train_metrics(self): - """Accesses all training metric objects.""" - return self._train_metrics - - @property - def validation_metrics(self): - """Accesses all validation metric objects.""" - return self._validation_metrics - - def initialize(self): - """A callback function.
- - This function will be called when no checkpoint is found for the model. - If there is a checkpoint, the checkpoint will be loaded and this function - will not be called. Tasks may use this callback function to load a - pretrained checkpoint, saved under a directory other than the model_dir. - """ - self.task.initialize(self.model) - - @property - def checkpoint(self): - """Accesses the training checkpoint.""" - return self._checkpoint - - def add_recovery(self, params: TrainerConfig, - checkpoint_manager: tf.train.CheckpointManager): - if params.recovery_max_trials >= 0: - self._recovery = Recovery( - loss_upper_bound=params.loss_upper_bound, - recovery_begin_steps=params.recovery_begin_steps, - recovery_max_trials=params.recovery_max_trials, - checkpoint_manager=checkpoint_manager) - - def train_loop_end(self): - """See base class.""" - self.join() - # Checks if the model's numeric status is stable and conducts checkpoint - # recovery accordingly. - if self._recovery: - self._recovery.maybe_recover(self.train_loss.result().numpy(), - self.global_step.numpy()) - logs = {} - for metric in self.train_metrics + [self.train_loss]: - logs[metric.name] = metric.result() - metric.reset_states() - if callable(self.optimizer.learning_rate): - # A self-implemented optimizer may not have `optimizer.iterations`, so - # fall back to the global step just to be safe. - if hasattr(self.optimizer, "iterations"): - logs["learning_rate"] = self.optimizer.learning_rate( - self.optimizer.iterations) - else: - logs["learning_rate"] = self.optimizer.learning_rate(self.global_step) - else: - logs["learning_rate"] = self.optimizer.learning_rate - return logs - - def train_step(self, iterator): - """See base class.""" - - def step_fn(inputs): - if self.config.runtime.enable_xla and (self.config.runtime.num_gpus > 0): - task_train_step = tf.function(self.task.train_step, jit_compile=True) - else: - task_train_step = self.task.train_step - logs = task_train_step( - inputs, - model=self.model, - optimizer=self.optimizer, - metrics=self.train_metrics) - self._train_loss.update_state(logs[self.task.loss]) - self.global_step.assign_add(1) - - self.strategy.run( - step_fn, args=(next(iterator),), options=self._runtime_options) - - def eval_begin(self): - """Sets up metrics.""" - for metric in self.validation_metrics + [self.validation_loss]: - metric.reset_states() - # Swaps weights so evaluation runs on the moving average of the weights. - if self.optimizer and isinstance(self.optimizer, - optimization.ExponentialMovingAverage): - self.optimizer.swap_weights() - - def eval_step(self, iterator): - """See base class.""" - - def step_fn(inputs): - logs = self.task.validation_step( - inputs, model=self.model, metrics=self.validation_metrics) - if self.task.loss in logs: - self._validation_loss.update_state(logs[self.task.loss]) - return logs - - distributed_outputs = self.strategy.run(step_fn, args=(next(iterator),)) - return tf.nest.map_structure(self.strategy.experimental_local_results, - distributed_outputs) - - def eval_end(self, aggregated_logs=None): - """Processes evaluation results.""" - self.join() - logs = {} - for metric in self.validation_metrics: - logs[metric.name] = metric.result() - if self.validation_loss.count.numpy() != 0: - logs[self.validation_loss.name] = self.validation_loss.result() - else: - # `self.validation_loss` metric was not updated, because the validation - # loss was not returned from the task's `validation_step` method.
- logging.info("The task did not report validation loss.") - if aggregated_logs: - metrics = self.task.reduce_aggregated_logs( - aggregated_logs, global_step=self.global_step) - logs.update(metrics) - - if self._checkpoint_exporter: - self._checkpoint_exporter.maybe_export_checkpoint( - self.checkpoint, logs, self.global_step.numpy()) - metric_name = self.config.trainer.best_checkpoint_eval_metric - logs["best_" + - metric_name] = self._checkpoint_exporter.best_ckpt_logs[metric_name] - - # Swaps back weights after testing when EMA is used. - # This happens after best checkpoint export so that average weights used for - # eval are exported instead of regular weights. - if self.optimizer and isinstance(self.optimizer, - optimization.ExponentialMovingAverage): - self.optimizer.swap_weights() - return logs - - def eval_reduce(self, state=None, step_outputs=None): - return self.task.aggregate_logs(state, step_outputs) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/base_trainer_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/base_trainer_test.py deleted file mode 100644 index cb938879299ec7457fec51e50051827d05840bc8..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/base_trainer_test.py +++ /dev/null @@ -1,422 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""Tests for tensorflow_models.core.trainers.trainer.""" -# pylint: disable=g-direct-tensorflow-import -import multiprocessing -import os -import sys - -from absl.testing import parameterized -import numpy as np -import orbit -import portpicker -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations -from official.core import base_trainer as trainer_lib -from official.core import config_definitions as cfg -from official.core import train_lib -from official.utils.testing import mock_task - -TPU_TEST = 'test_tpu' in sys.argv[0] -GPU_TEST = 'test_gpu' in sys.argv[0] - - -def all_strategy_combinations(): - return combinations.combine( - distribution=[ - strategy_combinations.default_strategy, - strategy_combinations.cloud_tpu_strategy, - strategy_combinations.one_device_strategy_gpu, - ],) - - -def create_in_process_cluster(num_workers, num_ps): - """Creates and starts local servers and returns the cluster_resolver.""" - worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)] - ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)] - - cluster_dict = {} - cluster_dict['worker'] = ['localhost:%s' % port for port in worker_ports] - if num_ps > 0: - cluster_dict['ps'] = ['localhost:%s' % port for port in ps_ports] - - cluster_spec = tf.train.ClusterSpec(cluster_dict) - - # Workers need some inter_ops threads to work properly. - worker_config = tf.compat.v1.ConfigProto() - if multiprocessing.cpu_count() < num_workers + 1: - worker_config.inter_op_parallelism_threads = num_workers + 1 - - for i in range(num_workers): - tf.distribute.Server( - cluster_spec, - job_name='worker', - task_index=i, - config=worker_config, - protocol='grpc') - - for i in range(num_ps): - tf.distribute.Server( - cluster_spec, job_name='ps', task_index=i, protocol='grpc') - - cluster_resolver = tf.distribute.cluster_resolver.SimpleClusterResolver( - cluster_spec, rpc_layer='grpc') - return cluster_resolver - - -def dataset_fn(input_context=None): - del input_context - - def dummy_data(_): - return tf.zeros((1, 1), dtype=tf.float32) - - dataset = tf.data.Dataset.range(1) - dataset = dataset.repeat() - dataset = dataset.map( - dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) - return dataset - - -class MockAsyncTrainer(trainer_lib._AsyncTrainer): - """Mock AsyncTrainer to test the _AsyncTrainer class.""" - - def __init__(self): - self._strategy = tf.distribute.get_strategy() - self.init_async() - - self.global_step = tf.Variable( - 0, - dtype=tf.int64, - name='global_step', - trainable=False, - aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA) - self.eval_global_step = tf.Variable( - 0, - dtype=tf.int64, - name='eval_global_step', - trainable=False, - aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA) - - train_dataset = self.distribute_dataset(dataset_fn) - orbit.StandardTrainer.__init__( - self, train_dataset, options=orbit.StandardTrainerOptions()) - - validation_dataset = self.distribute_dataset(dataset_fn) - orbit.StandardEvaluator.__init__( - self, - validation_dataset, - options=orbit.StandardEvaluatorOptions(use_tf_while_loop=True)) - - def train_loop_begin(self): - self.global_step.assign(0) - - def train_step(self, iterator): - - def replica_step(_): - self.global_step.assign_add(1) - - self._strategy.run(replica_step, args=(next(iterator),)) - - def train_loop_end(self): - self.join() - return self.global_step.numpy() - - def eval_begin(self): - 
self.eval_global_step.assign(0)
-
-  def eval_step(self, iterator):
-
-    def replica_step(_):
-      self.eval_global_step.assign_add(1)
-
-    self._strategy.run(replica_step, args=(next(iterator),))
-
-  def eval_end(self):
-    self.join()
-    return self.eval_global_step.numpy()
-
-
-class TrainerTest(tf.test.TestCase, parameterized.TestCase):
-
-  def setUp(self):
-    super().setUp()
-    self._config = cfg.ExperimentConfig(
-        trainer=cfg.TrainerConfig(
-            optimizer_config=cfg.OptimizationConfig({
-                'optimizer': {
-                    'type': 'sgd'
-                },
-                'learning_rate': {
-                    'type': 'constant'
-                }
-            })))
-
-  def create_test_trainer(self, config, model_dir=None, task=None):
-    task = task or mock_task.MockTask(config.task, logging_dir=model_dir)
-    ckpt_exporter = train_lib.maybe_create_best_ckpt_exporter(config, model_dir)
-    trainer = trainer_lib.Trainer(
-        config,
-        task,
-        model=task.build_model(),
-        optimizer=task.create_optimizer(config.trainer.optimizer_config,
-                                        config.runtime),
-        checkpoint_exporter=ckpt_exporter)
-    return trainer
-
-  @combinations.generate(all_strategy_combinations())
-  def test_trainer_train(self, distribution):
-    with distribution.scope():
-      trainer = self.create_test_trainer(self._config)
-      logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
-      self.assertIn('training_loss', logs)
-      self.assertIn('learning_rate', logs)
-
-  @combinations.generate(all_strategy_combinations())
-  def test_trainer_passing_datasets(self, distribution):
-    with distribution.scope():
-      task = mock_task.MockTask(self._config)
-      train_dataset = orbit.utils.make_distributed_dataset(
-          distribution, task.build_inputs, self._config.task.train_data)
-      validation_dataset = orbit.utils.make_distributed_dataset(
-          distribution, task.build_inputs, self._config.task.validation_data)
-      self._config.task.train_data = None
-      self._config.task.validation_data = None
-      trainer = trainer_lib.Trainer(
-          self._config,
-          task,
-          model=task.build_model(),
-          optimizer=task.create_optimizer(self._config.trainer.optimizer_config,
-                                          self._config.runtime),
-          train_dataset=train_dataset,
-          validation_dataset=validation_dataset)
-    logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
-    self.assertIn('training_loss', logs)
-    self.assertIn('learning_rate', logs)
-    logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32))
-    self.assertIn('validation_loss', logs)
-
-  def test_base_async_trainer(self):
-    if TPU_TEST or GPU_TEST:
-      self.skipTest('Async training is not available on GPU/TPU.')
-    num_workers = 3
-    num_ps = 2
-    cluster_resolver = create_in_process_cluster(num_workers, num_ps)
-    distribution = tf.distribute.experimental.ParameterServerStrategy(
-        cluster_resolver)
-    with distribution.scope():
-      trainer = MockAsyncTrainer()
-      trainer.init_async()
-      self.assertIsInstance(
-          trainer._coordinator,
-          tf.distribute.experimental.coordinator.ClusterCoordinator)
-      self.assertEqual(trainer.train(tf.constant(10)), 10)
-      self.assertEqual(trainer.evaluate(tf.constant(11)), 11)
-
-  def test_async_trainer_train(self):
-    if TPU_TEST or GPU_TEST:
-      self.skipTest('Async training is not available on GPU/TPU.')
-    num_workers = 3
-    num_ps = 2
-    cluster_resolver = create_in_process_cluster(num_workers, num_ps)
-    distribution = tf.distribute.experimental.ParameterServerStrategy(
-        cluster_resolver)
-    with distribution.scope():
-      config = cfg.ExperimentConfig(**self._config.as_dict())
-      config.trainer.eval_tf_while_loop = True
-      trainer = self.create_test_trainer(config)
-      logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
-      self.assertIn('training_loss', logs)
-      self.assertIn('learning_rate', logs)
-
-  def test_async_trainer_validate(self):
-    if TPU_TEST or GPU_TEST:
-      self.skipTest('Async training is not available on GPU/TPU.')
-    num_workers = 3
-    num_ps = 2
-    cluster_resolver = create_in_process_cluster(num_workers, num_ps)
-    distribution = tf.distribute.experimental.ParameterServerStrategy(
-        cluster_resolver)
-    with distribution.scope():
-      config = cfg.ExperimentConfig(**self._config.as_dict())
-      config.trainer.eval_tf_while_loop = True
-      trainer = self.create_test_trainer(config)
-      logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32))
-      self.assertIn('acc', logs)
-      self.assertIn('validation_loss', logs)
-
-  @combinations.generate(all_strategy_combinations())
-  def test_trainer_validate(self, distribution):
-    with distribution.scope():
-      trainer = self.create_test_trainer(self._config)
-      logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32))
-      self.assertEqual(logs['counter'], 5. * distribution.num_replicas_in_sync)
-      self.assertIn('validation_loss', logs)
-
-  @combinations.generate(all_strategy_combinations())
-  def test_trainer_validate_without_loss(self, distribution):
-
-    class MockTaskWithoutValidationLoss(mock_task.MockTask):
-
-      def validation_step(self, inputs, model, metrics=None):
-        # Disable validation loss.
-        logs = super().validation_step(inputs, model)
-        del logs[self.loss]
-        return logs
-
-    with distribution.scope():
-      task = MockTaskWithoutValidationLoss()
-      trainer = self.create_test_trainer(self._config, task=task)
-      logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32))
-      self.assertEqual(logs['counter'], 5. * distribution.num_replicas_in_sync)
-      self.assertNotIn('validation_loss', logs)
-
-  @combinations.generate(
-      combinations.combine(
-          mixed_precision_dtype=['float32', 'bfloat16', 'float16'],
-          loss_scale=[None, 'dynamic', 128, 256],
-      ))
-  def test_configure_optimizer(self, mixed_precision_dtype, loss_scale):
-    config = cfg.ExperimentConfig(
-        runtime=cfg.RuntimeConfig(
-            mixed_precision_dtype=mixed_precision_dtype, loss_scale=loss_scale),
-        trainer=cfg.TrainerConfig(
-            optimizer_config=cfg.OptimizationConfig({
-                'optimizer': {
-                    'type': 'sgd'
-                },
-                'learning_rate': {
-                    'type': 'constant'
-                },
-            })))
-    trainer = self.create_test_trainer(config)
-    if mixed_precision_dtype != 'float16':
-      self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD)
-    elif mixed_precision_dtype == 'float16' and loss_scale is None:
-      self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD)
-    else:
-      self.assertIsInstance(trainer.optimizer,
-                            tf.keras.mixed_precision.LossScaleOptimizer)
-
-    metrics = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
-    self.assertIn('training_loss', metrics)
-
-  def test_export_best_ckpt(self):
-    config = cfg.ExperimentConfig(
-        trainer=cfg.TrainerConfig(
-            best_checkpoint_export_subdir='best_ckpt',
-            best_checkpoint_eval_metric='acc',
-            optimizer_config=cfg.OptimizationConfig({
-                'optimizer': {
-                    'type': 'sgd'
-                },
-                'learning_rate': {
-                    'type': 'constant'
-                }
-            })))
-    model_dir = self.get_temp_dir()
-    trainer = self.create_test_trainer(config, model_dir=model_dir)
-    trainer.train(tf.convert_to_tensor(1, dtype=tf.int32))
-    trainer.evaluate(tf.convert_to_tensor(1, dtype=tf.int32))
-    self.assertTrue(
-        tf.io.gfile.exists(os.path.join(model_dir, 'best_ckpt', 'info.json')))
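The `test_recovery` case below exercises the blowup-recovery path end to end: save a checkpoint, train past the loss bound, and verify the weights were restored. For reference, a hedged sketch of the same wiring outside the test harness, assuming a `trainer` built as in `create_test_trainer` above and a writable `model_dir`:

```python
config = cfg.ExperimentConfig(
    trainer=cfg.TrainerConfig(
        loss_upper_bound=0.5,    # losses above this bound count as a blowup
        recovery_begin_steps=0,  # enforce the bound from the first step
        recovery_max_trials=2))  # restore from checkpoint at most twice
checkpoint_manager = tf.train.CheckpointManager(
    trainer.checkpoint, model_dir, max_to_keep=2)
checkpoint_manager.save()  # recovery restores from the latest saved checkpoint
trainer.add_recovery(config.trainer, checkpoint_manager=checkpoint_manager)
```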
-
-  def test_recovery(self):
-    config = cfg.ExperimentConfig(
-        trainer=cfg.TrainerConfig(
-            loss_upper_bound=0.5,
-            recovery_max_trials=2,
-            optimizer_config=cfg.OptimizationConfig({
-                'optimizer': {
-                    'type': 'sgd'
-                },
-                'learning_rate': {
-                    'type': 'constant'
-                }
-            })))
-    model_dir = self.get_temp_dir()
-    trainer = self.create_test_trainer(config, model_dir=model_dir)
-    checkpoint_manager = tf.train.CheckpointManager(
-        trainer.checkpoint, self.get_temp_dir(), max_to_keep=2)
-    checkpoint_manager.save()
-    trainer.add_recovery(config.trainer, checkpoint_manager=checkpoint_manager)
-    before_weights = trainer.model.get_weights()
-    _ = trainer.train(tf.convert_to_tensor(1, dtype=tf.int32))
-    # The training loss is 1.0 and the upper bound is 0.5, so recovery happens.
-    after_weights = trainer.model.get_weights()
-    for left, right in zip(before_weights, after_weights):
-      self.assertAllEqual(left, right)
-
-    # Let the loss be NaN and max_trials = 0 to trigger a RuntimeError.
-    config = cfg.ExperimentConfig(
-        trainer=cfg.TrainerConfig(
-            recovery_max_trials=0,
-            optimizer_config=cfg.OptimizationConfig({
-                'optimizer': {
-                    'type': 'sgd'
-                },
-                'learning_rate': {
-                    'type': 'constant'
-                }
-            })))
-    task = mock_task.MockTask(config.task, logging_dir=model_dir)
-
-    def build_losses(labels, model_outputs, aux_losses=None):
-      del labels, model_outputs
-      return tf.constant([np.nan], tf.float32) + aux_losses
-
-    task.build_losses = build_losses
-    trainer = trainer_lib.Trainer(
-        config,
-        task,
-        model=task.build_model(),
-        optimizer=task.create_optimizer(config.trainer.optimizer_config,
-                                        config.runtime))
-    trainer.add_recovery(config.trainer, checkpoint_manager=checkpoint_manager)
-    with self.assertRaises(RuntimeError):
-      _ = trainer.train(tf.convert_to_tensor(2, dtype=tf.int32))
-
-  def test_model_with_compiled_loss(self):
-    task = mock_task.MockTask()
-    model = task.build_model()
-    model.compile(loss=tf.keras.losses.CategoricalCrossentropy())
-    trainer = trainer_lib.Trainer(
-        self._config,
-        task,
-        model=model,
-        optimizer=task.create_optimizer(self._config.trainer.optimizer_config))
-    logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
-    self.assertIn('training_loss', logs)
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/config_definitions.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/config_definitions.py
deleted file mode 100644
index 498ecea4b9b4148ee8d6465ec26ed155929b5686..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/config_definitions.py
+++ /dev/null
@@ -1,268 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Common configuration settings.""" - -from typing import Optional, Sequence, Union - -import dataclasses - -from official.modeling.hyperparams import base_config -from official.modeling.optimization.configs import optimization_config - -OptimizationConfig = optimization_config.OptimizationConfig - - -@dataclasses.dataclass -class DataConfig(base_config.Config): - """The base configuration for building datasets. - - Attributes: - input_path: The path to the input. It can be either (1) a str indicating - a file path/pattern, or (2) a str indicating multiple file paths/patterns - separated by comma (e.g "a, b, c" or no spaces "a,b,c"), or - (3) a list of str, each of which is a file path/pattern or multiple file - paths/patterns separated by comma. - It should not be specified when the following `tfds_name` is specified. - tfds_name: The name of the tensorflow dataset (TFDS). It should not be - specified when the above `input_path` is specified. - tfds_split: A str indicating which split of the data to load from TFDS. It - is required when above `tfds_name` is specified. - global_batch_size: The global batch size across all replicas. - is_training: Whether this data is used for training or not. - drop_remainder: Whether the last batch should be dropped in the case it has - fewer than `global_batch_size` elements. - shuffle_buffer_size: The buffer size used for shuffling training data. - cache: Whether to cache dataset examples. If `True`, we will cache the - dataset after applying the decode_fn and parse_fn. It can be used to avoid - re-reading from disk, re-decoding and re-parsing the example on the - second epoch, but it requires significant memory overhead. - cycle_length: The number of files that will be processed concurrently when - interleaving files. - block_length: The number of consecutive elements to produce from each input - element before cycling to another input element when interleaving files. - deterministic: A boolean controlling whether determinism should be enforced. - sharding: Whether sharding is used in the input pipeline. - enable_tf_data_service: A boolean indicating whether to enable tf.data - service for the input pipeline. - tf_data_service_address: The URI of a tf.data service to offload - preprocessing onto during training. The URI should be in the format - "protocol://address", e.g. "grpc://tf-data-service:5050". It can be - overridden by `FLAGS.tf_data_service` flag in the binary. - tf_data_service_job_name: The name of the tf.data service job. This - argument makes it possible for multiple datasets to share the same job. - The default behavior is that the dataset creates anonymous, exclusively - owned jobs. - tfds_data_dir: A str specifying the directory to read/write TFDS data. - tfds_as_supervised: A bool. When loading dataset from TFDS, if True, the - returned tf.data.Dataset will have a 2-tuple structure (input, label) - according to builder.info.supervised_keys; if False, the default, the - returned tf.data.Dataset will have a dictionary with all the features. 
-    tfds_skip_decoding_feature: A str to indicate which features are skipped for
-      decoding when loading dataset from TFDS. Use comma to separate multiple
-      features. The main use case is to skip the image/video decoding for better
-      performance.
-    seed: An optional seed to use for deterministic shuffling/preprocessing.
-  """
-  input_path: Union[Sequence[str], str] = ""
-  tfds_name: str = ""
-  tfds_split: str = ""
-  global_batch_size: int = 0
-  is_training: bool = None
-  drop_remainder: bool = True
-  shuffle_buffer_size: int = 100
-  cache: bool = False
-  cycle_length: Optional[int] = None
-  block_length: int = 1
-  deterministic: Optional[bool] = None
-  sharding: bool = True
-  enable_tf_data_service: bool = False
-  tf_data_service_address: Optional[str] = None
-  tf_data_service_job_name: Optional[str] = None
-  tfds_data_dir: str = ""
-  tfds_as_supervised: bool = False
-  tfds_skip_decoding_feature: str = ""
-  seed: Optional[int] = None
-
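A hedged sketch of a typical file-based `DataConfig` built from the attributes documented above; the TFRecord pattern is hypothetical, and for TFDS input one would instead set `tfds_name`/`tfds_split` and leave `input_path` empty:

```python
train_data = DataConfig(
    input_path='/data/train*.tfrecord',  # hypothetical file pattern
    global_batch_size=32,
    is_training=True,
    shuffle_buffer_size=10000,
    drop_remainder=True)
```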
-
-@dataclasses.dataclass
-class RuntimeConfig(base_config.Config):
-  """High-level configurations for Runtime.
-
-  These include parameters that are not directly related to the experiment,
-  e.g. directories, accelerator type, etc.
-
-  Attributes:
-    distribution_strategy: e.g. 'mirrored', 'tpu', etc.
-    enable_xla: Whether or not to enable XLA.
-    per_gpu_thread_count: thread count per GPU.
-    gpu_thread_mode: Whether and how the GPU device uses its own threadpool.
-    dataset_num_private_threads: Number of threads for a private threadpool
-      created for all datasets computation.
-    tpu: The address of the TPU to use, if any.
-    num_gpus: The number of GPUs to use, if any.
-    worker_hosts: comma-separated list of worker ip:port pairs for running
-      multi-worker models with DistributionStrategy.
-    task_index: If multi-worker training, the task index of this worker.
-    all_reduce_alg: Defines the algorithm for performing all-reduce.
-    num_packs: Sets `num_packs` in the cross device ops used in
-      MirroredStrategy. For details, see tf.distribute.NcclAllReduce.
-    mixed_precision_dtype: dtype of mixed precision policy. It can be 'float32',
-      'float16', or 'bfloat16'.
-    loss_scale: The type of loss scale, or 'float' value. This is used when
-      setting the mixed precision policy.
-    run_eagerly: Whether or not to run the experiment eagerly.
-    batchnorm_spatial_persistent: Whether or not to enable the spatial
-      persistent mode for CuDNN batch norm kernel for improved GPU performance.
-  """
-  distribution_strategy: str = "mirrored"
-  enable_xla: bool = False
-  gpu_thread_mode: Optional[str] = None
-  dataset_num_private_threads: Optional[int] = None
-  per_gpu_thread_count: int = 0
-  tpu: Optional[str] = None
-  num_gpus: int = 0
-  worker_hosts: Optional[str] = None
-  task_index: int = -1
-  all_reduce_alg: Optional[str] = None
-  num_packs: int = 1
-  mixed_precision_dtype: Optional[str] = None
-  loss_scale: Optional[Union[str, float]] = None
-  run_eagerly: bool = False
-  batchnorm_spatial_persistent: bool = False
-
-  # XLA runtime params.
-  # XLA params are only applied to the train_step.
-  # These arguments can improve training speed. They can also improve eval, but
-  # may reduce usability and users would need to make changes to code.
-
-  # Whether to enable XLA dynamic padder
-  # infrastructure to handle dynamic shapes inputs inside XLA. True by
-  # default. Disabling this may cause correctness issues with dynamic shapes
-  # inputs, as XLA will just assume the inputs are with padded shapes. However
-  # users can optionally set it to False to improve device time if masking is
-  # already handled in the user side.
-  # If None, will respect XLA default.
-  tpu_enable_xla_dynamic_padder: Optional[bool] = None
-
-  # Global model parallelism configurations.
-  num_cores_per_replica: int = 1
-  default_shard_dim: int = -1
-
-  def model_parallelism(self):
-    return dict(
-        num_cores_per_replica=self.num_cores_per_replica,
-        default_shard_dim=self.default_shard_dim)
-
-
-@dataclasses.dataclass
-class TrainerConfig(base_config.Config):
-  """Configuration for trainer.
-
-  Attributes:
-    optimizer_config: optimizer config, it includes optimizer, learning rate,
-      and warmup schedule configs.
-    train_tf_while_loop: whether or not to use tf while loop.
-    train_tf_function: whether or not to use tf_function for training loop.
-    eval_tf_function: whether or not to use tf_function for eval.
-    allow_tpu_summary: Whether to allow summaries to happen inside the XLA
-      program that runs on TPU, through automatic outside compilation.
-    steps_per_loop: number of steps per loop.
-    summary_interval: number of steps between each summary.
-    checkpoint_interval: number of steps between checkpoints.
-    max_to_keep: max checkpoints to keep.
-    continuous_eval_timeout: maximum number of seconds to wait between
-      checkpoints; if set to None, continuous eval will wait indefinitely. This
-      is only used in continuous_train_and_eval and continuous_eval modes. The
-      default value is 1 hour.
-    train_steps: number of train steps.
-    validation_steps: number of eval steps. If `None`, the entire eval dataset
-      is used.
-    validation_interval: number of training steps to run between evaluations.
-    best_checkpoint_export_subdir: if set, the trainer will keep track of the
-      best evaluation metric, and export the corresponding best checkpoint under
-      `model_dir/best_checkpoint_export_subdir`. Note that this only works if
-      mode contains eval (such as `train_and_eval`, `continuous_eval`, and
-      `continuous_train_and_eval`).
-    best_checkpoint_eval_metric: for exporting the best checkpoint, which
-      evaluation metric the trainer should monitor. This can be any evaluation
-      metric that appears on TensorBoard.
-    best_checkpoint_metric_comp: for exporting the best checkpoint, how the
-      trainer should compare the evaluation metrics. This can be either `higher`
-      (higher is better) or `lower` (lower is better).
-    validation_summary_subdir: A `str`, subdirectory for saving eval summaries.
-  """
-  optimizer_config: OptimizationConfig = OptimizationConfig()
-  # Orbit settings.
-  train_tf_while_loop: bool = True
-  train_tf_function: bool = True
-  eval_tf_function: bool = True
-  eval_tf_while_loop: bool = False
-  allow_tpu_summary: bool = False
-  # Trainer intervals.
-  steps_per_loop: int = 1000
-  summary_interval: int = 1000
-  checkpoint_interval: int = 1000
-  # Checkpoint manager.
-  max_to_keep: int = 5
-  continuous_eval_timeout: int = 60 * 60
-  # Train/Eval routines.
-  train_steps: int = 0
-  # Sets validation steps to -1 to evaluate the entire dataset.
-  validation_steps: int = -1
-  validation_interval: int = 1000
-  # Best checkpoint export.
-  best_checkpoint_export_subdir: str = ""
-  best_checkpoint_eval_metric: str = ""
-  best_checkpoint_metric_comp: str = "higher"
-  # Blowup recovery.
-  loss_upper_bound: float = 1e6
-  recovery_begin_steps: int = 0  # Enforcing the loss bound after these steps.
-  # When max trials < 0, there is no recovery module; when max trials = 0, we
-  # check the condition and fail the job if it triggers; when max trials > 0,
-  # we restore the model states.
-  recovery_max_trials: int = 0
-  validation_summary_subdir: str = "validation"
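The best-checkpoint fields above cooperate with the evaluator: a hedged sketch of a trainer config that keeps the checkpoint with the highest accuracy (the metric name `acc` matches this repo's mock task; substitute whatever appears in your eval logs):

```python
trainer_config = TrainerConfig(
    best_checkpoint_export_subdir='best_ckpt',  # exported under model_dir
    best_checkpoint_eval_metric='acc',          # any metric in the eval logs
    best_checkpoint_metric_comp='higher')       # keep the highest value seen
```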
-
-
-@dataclasses.dataclass
-class TaskConfig(base_config.Config):
-  init_checkpoint: str = ""
-  model: base_config.Config = None
-  train_data: DataConfig = DataConfig()
-  validation_data: DataConfig = DataConfig()
-
-
-@dataclasses.dataclass
-class ExperimentConfig(base_config.Config):
-  """Top-level configuration."""
-  task: TaskConfig = TaskConfig()
-  trainer: TrainerConfig = TrainerConfig()
-  runtime: RuntimeConfig = RuntimeConfig()
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/exp_factory.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/exp_factory.py
deleted file mode 100644
index e9dbe0972d4ecf4e1e5b3e94142d4b633070a38a..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/exp_factory.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""Experiment factory methods."""
-
-from official.core import config_definitions as cfg
-from official.core import registry
-
-
-_REGISTERED_CONFIGS = {}
-
-
-def register_config_factory(name):
-  """Register ExperimentConfig factory method."""
-  return registry.register(_REGISTERED_CONFIGS, name)
-
-
-def get_exp_config_creater(exp_name: str):
-  """Looks up ExperimentConfig factory methods."""
-  exp_creater = registry.lookup(_REGISTERED_CONFIGS, exp_name)
-  return exp_creater
-
-
-def get_exp_config(exp_name: str) -> cfg.ExperimentConfig:
-  return get_exp_config_creater(exp_name)()
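To make the factory flow above concrete, a hedged sketch of registering and then retrieving an experiment configuration; the experiment name `my_bert_squad` is hypothetical:

```python
from official.core import config_definitions as cfg
from official.core import exp_factory

@exp_factory.register_config_factory('my_bert_squad')
def my_bert_squad_config() -> cfg.ExperimentConfig:
  # Return a fully populated top-level config for this experiment.
  return cfg.ExperimentConfig(
      task=cfg.TaskConfig(
          train_data=cfg.DataConfig(
              input_path='/data/train*.tfrecord',  # hypothetical path
              global_batch_size=32,
              is_training=True)))

config = exp_factory.get_exp_config('my_bert_squad')  # invokes the factory
```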
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/export_base.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/export_base.py
deleted file mode 100644
index b8529a2b73991b15cc700a0fc30486a82a48665a..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/export_base.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""Base class for model export."""
-
-import abc
-import functools
-from typing import Any, Callable, Dict, Mapping, List, Optional, Text, Union
-
-import tensorflow as tf
-from tensorflow.python.saved_model.model_utils import export_utils
-
-
-class ExportModule(tf.Module, metaclass=abc.ABCMeta):
-  """Base Export Module."""
-
-  def __init__(self,
-               params,
-               model: Union[tf.Module, tf.keras.Model],
-               inference_step: Optional[Callable[..., Any]] = None):
-    """Instantiates an ExportModule.
-
-    Args:
-      params: A dataclass for parameters to the module.
-      model: A model instance which contains weights and forward computation.
-      inference_step: An optional callable to define how the model is called.
-    """
-    super().__init__(name=None)
-    self.model = model
-    self.params = params
-
-    if inference_step is not None:
-      self.inference_step = functools.partial(inference_step, model=self.model)
-    else:
-      self.inference_step = functools.partial(
-          self.model.__call__, training=False)
-
-  @abc.abstractmethod
-  def serve(self) -> Mapping[Text, tf.Tensor]:
-    """The bare inference function which should run on all devices.
-
-    Tensors are expected to be passed in through keyword arguments. Returns a
-    dictionary of tensors, where the keys will be used inside the SignatureDef.
-    """
-
-  @abc.abstractmethod
-  def get_inference_signatures(
-      self, function_keys: Dict[Text, Text]) -> Mapping[Text, Any]:
-    """Gets defined function signatures."""
-
-
-def export(export_module: ExportModule,
-           function_keys: Union[List[Text], Dict[Text, Text]],
-           export_savedmodel_dir: Text,
-           checkpoint_path: Optional[Text] = None,
-           timestamped: bool = True,
-           save_options: Optional[tf.saved_model.SaveOptions] = None) -> Text:
-  """Exports to SavedModel format.
-
-  Args:
-    export_module: an ExportModule with the keras Model and serving tf.functions.
-    function_keys: a list of string keys to retrieve pre-defined serving
-      signatures. The signature keys will be set with defaults. If a dictionary
-      is provided, the values will be used as signature keys.
-    export_savedmodel_dir: Output saved model directory.
-    checkpoint_path: Object-based checkpoint path or directory.
-    timestamped: Whether to export the savedmodel to a timestamped directory.
-    save_options: `SaveOptions` for `tf.saved_model.save`.
- - Returns: - The savedmodel directory path. - """ - ckpt_dir_or_file = checkpoint_path - if tf.io.gfile.isdir(ckpt_dir_or_file): - ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file) - if ckpt_dir_or_file: - checkpoint = tf.train.Checkpoint(model=export_module.model) - checkpoint.read( - ckpt_dir_or_file).assert_existing_objects_matched().expect_partial() - if isinstance(function_keys, list): - if len(function_keys) == 1: - function_keys = { - function_keys[0]: tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY - } - else: - raise ValueError( - "If the function_keys is a list, it must contain a single element. %s" - % function_keys) - - signatures = export_module.get_inference_signatures(function_keys) - if timestamped: - export_dir = export_utils.get_timestamped_export_dir( - export_savedmodel_dir).decode("utf-8") - else: - export_dir = export_savedmodel_dir - tf.saved_model.save( - export_module, export_dir, signatures=signatures, options=save_options) - return export_dir diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/export_base_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/export_base_test.py deleted file mode 100644 index c6838697b9e2c0a2b9ff5a9cd587446df807321e..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/export_base_test.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""Tests for official.core.export_base.""" -import os -from typing import Any, Dict, Mapping, Text - -import tensorflow as tf - -from official.core import export_base - - -class TestModule(export_base.ExportModule): - - @tf.function - def serve(self, inputs: tf.Tensor) -> Mapping[Text, tf.Tensor]: - return {'outputs': self.inference_step(inputs)} - - def get_inference_signatures( - self, function_keys: Dict[Text, Text]) -> Mapping[Text, Any]: - input_signature = tf.TensorSpec(shape=[None, None], dtype=tf.float32) - return {'foo': self.serve.get_concrete_function(input_signature)} - - -class ExportBaseTest(tf.test.TestCase): - - def test_export_module(self): - tmp_dir = self.get_temp_dir() - model = tf.keras.layers.Dense(2) - inputs = tf.ones([2, 4], tf.float32) - expected_output = model(inputs, training=False) - module = TestModule(params=None, model=model) - ckpt_path = tf.train.Checkpoint(model=model).save( - os.path.join(tmp_dir, 'ckpt')) - export_dir = export_base.export( - module, ['foo'], - export_savedmodel_dir=tmp_dir, - checkpoint_path=ckpt_path, - timestamped=True) - self.assertTrue(os.path.exists(os.path.join(export_dir, 'saved_model.pb'))) - self.assertTrue( - os.path.exists( - os.path.join(export_dir, 'variables', 'variables.index'))) - self.assertTrue( - os.path.exists( - os.path.join(export_dir, 'variables', - 'variables.data-00000-of-00001'))) - - imported = tf.saved_model.load(export_dir) - output = imported.signatures['foo'](inputs) - self.assertAllClose(output['outputs'].numpy(), expected_output.numpy()) - - def test_custom_inference_step(self): - tmp_dir = self.get_temp_dir() - model = tf.keras.layers.Dense(2) - inputs = tf.ones([2, 4], tf.float32) - - def _inference_step(inputs, model): - return tf.nn.softmax(model(inputs, training=False)) - - module = TestModule( - params=None, model=model, inference_step=_inference_step) - expected_output = _inference_step(inputs, model) - ckpt_path = tf.train.Checkpoint(model=model).save( - os.path.join(tmp_dir, 'ckpt')) - export_dir = export_base.export( - module, ['foo'], - export_savedmodel_dir=tmp_dir, - checkpoint_path=ckpt_path, - timestamped=False) - imported = tf.saved_model.load(export_dir) - output = imported.signatures['foo'](inputs) - self.assertAllClose(output['outputs'].numpy(), expected_output.numpy()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/input_reader.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/input_reader.py deleted file mode 100644 index f6ce85e5347bc893cf668e1e3cf5844ccd800adb..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/input_reader.py +++ /dev/null @@ -1,406 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""A common dataset reader."""
-import random
-from typing import Any, Callable, List, Optional
-
-from absl import logging
-import tensorflow as tf
-import tensorflow_datasets as tfds
-
-from official.core import config_definitions as cfg
-
-
-def _get_random_integer():
-  return random.randint(0, (1 << 31) - 1)
-
-
-def _maybe_map_fn(dataset: tf.data.Dataset,
-                  fn: Optional[Callable[..., Any]] = None) -> tf.data.Dataset:
-  """Calls dataset.map if a valid function is passed in."""
-  return dataset if fn is None else dataset.map(
-      fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
-
-
-class InputReader:
-  """Input reader that returns a tf.data.Dataset instance."""
-
-  # A static random number which is the same across different InputReader
-  # instances.
-  static_randnum = _get_random_integer()
-
-  def __init__(self,
-               params: cfg.DataConfig,
-               dataset_fn=tf.data.TFRecordDataset,
-               decoder_fn: Optional[Callable[..., Any]] = None,
-               sample_fn: Optional[Callable[..., Any]] = None,
-               parser_fn: Optional[Callable[..., Any]] = None,
-               transform_and_batch_fn: Optional[Callable[
-                   [tf.data.Dataset, Optional[tf.distribute.InputContext]],
-                   tf.data.Dataset]] = None,
-               postprocess_fn: Optional[Callable[..., Any]] = None):
-    """Initializes an InputReader instance.
-
-    Args:
-      params: A config_definitions.DataConfig object.
-      dataset_fn: A `tf.data.Dataset` that consumes the input files. For
-        example, it can be `tf.data.TFRecordDataset`.
-      decoder_fn: An optional `callable` that takes the serialized data string
-        and decodes it into the raw tensor dictionary.
-      sample_fn: An optional `callable` that takes a `tf.data.Dataset` object as
-        input and outputs the transformed dataset. It performs sampling on the
-        decoded raw tensors dict before the parser_fn.
-      parser_fn: An optional `callable` that takes the decoded raw tensors dict
-        and parses them into a dictionary of tensors that can be consumed by the
-        model. It will be executed after decoder_fn.
-      transform_and_batch_fn: An optional `callable` that takes a
-        `tf.data.Dataset` object and an optional `tf.distribute.InputContext` as
-        input, and returns a `tf.data.Dataset` object. It will be executed after
-        `parser_fn` to transform and batch the dataset; if None, after
-        `parser_fn` is executed, the dataset will be batched into the
-        per-replica batch size.
-      postprocess_fn: An optional `callable` that processes batched tensors. It
-        will be executed after batching.
-    """
-    if params.input_path and params.tfds_name:
-      raise ValueError('At most one of `input_path` and `tfds_name` can be '
-                       'specified, but got %s and %s.' %
-                       (params.input_path, params.tfds_name))
-    self._tfds_builder = None
-    self._matched_files = []
-    if params.input_path:
-      self._matched_files = self._match_files(params.input_path)
-    else:
-      # Read dataset from TFDS.
- if not params.tfds_split: - raise ValueError( - '`tfds_name` is %s, but `tfds_split` is not specified.' % - params.tfds_name) - self._tfds_builder = tfds.builder( - params.tfds_name, data_dir=params.tfds_data_dir) - - self._global_batch_size = params.global_batch_size - self._is_training = params.is_training - self._drop_remainder = params.drop_remainder - self._shuffle_buffer_size = params.shuffle_buffer_size - self._cache = params.cache - self._cycle_length = params.cycle_length - self._block_length = params.block_length - self._deterministic = params.deterministic - self._sharding = params.sharding - self._tfds_split = params.tfds_split - self._tfds_as_supervised = params.tfds_as_supervised - self._tfds_skip_decoding_feature = params.tfds_skip_decoding_feature - - self._dataset_fn = dataset_fn - self._decoder_fn = decoder_fn - self._sample_fn = sample_fn - self._parser_fn = parser_fn - self._transform_and_batch_fn = transform_and_batch_fn - self._postprocess_fn = postprocess_fn - # When tf.data service is enabled, each data service worker should get - # different random seeds. Thus, we set `seed` to None. - if params.seed is not None: - self._seed = params.seed - elif params.enable_tf_data_service: - self._seed = _get_random_integer() - else: - self._seed = None - - self._enable_tf_data_service = ( - params.enable_tf_data_service and params.tf_data_service_address) - self._tf_data_service_address = params.tf_data_service_address - if self._enable_tf_data_service: - # Add a random seed as the tf.data service job name suffix, so tf.data - # service doesn't reuse the previous state if TPU worker gets preempted. - self._tf_data_service_job_name = ( - params.tf_data_service_job_name + str(self.static_randnum)) - self._enable_round_robin_tf_data_service = params.get( - 'enable_round_robin_tf_data_service', False) - - def _match_files(self, input_path: str) -> List[str]: - """Matches files from an input_path.""" - matched_files = [] - # Read dataset from files. - usage = ('`input_path` should be either (1) a str indicating a file ' - 'path/pattern, or (2) a str indicating multiple file ' - 'paths/patterns separated by comma (e.g "a, b, c" or no spaces ' - '"a,b,c", or (3) a list of str, each of which is a file ' - 'path/pattern or multiple file paths/patterns separated by ' - 'comma, but got: %s') - if isinstance(input_path, str): - input_path_list = [input_path] - elif isinstance(input_path, (list, tuple)): - if any(not isinstance(x, str) for x in input_path): - raise ValueError(usage % input_path) - input_path_list = input_path - else: - raise ValueError(usage % input_path) - - for input_path in input_path_list: - input_patterns = input_path.strip().split(',') - for input_pattern in input_patterns: - input_pattern = input_pattern.strip() - if not input_pattern: - continue - if '*' in input_pattern or '?' in input_pattern: - tmp_matched_files = tf.io.gfile.glob(input_pattern) - if not tmp_matched_files: - raise ValueError('%s does not match any files.' % input_pattern) - matched_files.extend(tmp_matched_files) - else: - matched_files.append(input_pattern) - - if not matched_files: - raise ValueError('%s does not match any files.' 
% input_path)
-
-    return matched_files
-
-  def _shard_files_then_read(
-      self,
-      matched_files: List[str],
-      dataset_fn,
-      input_context: Optional[tf.distribute.InputContext] = None
-  ) -> tf.data.Dataset:
-    """Shards the data files and then sends a split to every worker to read."""
-    dataset = tf.data.Dataset.from_tensor_slices(matched_files)
-
-    # Shuffle and repeat at file level.
-    # If cache is enabled, `reshuffle_each_iteration` is set to False,
-    # because we will read the same cached data in every iteration anyway.
-    if self._is_training:
-      dataset = dataset.shuffle(
-          len(matched_files),
-          seed=self._seed,
-          reshuffle_each_iteration=True if not self._cache else False)
-
-    # Do not enable sharding if tf.data service is enabled, as sharding will be
-    # handled inside tf.data service.
-    if self._sharding and input_context and (
-        input_context.num_input_pipelines > 1 and
-        not self._enable_tf_data_service):
-      dataset = dataset.shard(input_context.num_input_pipelines,
-                              input_context.input_pipeline_id)
-
-    # If cache is enabled, we will call `repeat()` later after `cache()`.
-    if self._is_training and not self._cache:
-      dataset = dataset.repeat()
-
-    dataset = dataset.interleave(
-        map_func=dataset_fn,
-        cycle_length=self._cycle_length,
-        block_length=self._block_length,
-        num_parallel_calls=(self._cycle_length if self._cycle_length else
-                            tf.data.experimental.AUTOTUNE),
-        deterministic=self._deterministic)
-    return dataset
-
-  def _read_files_then_shard(
-      self,
-      matched_files: List[str],
-      dataset_fn,
-      input_context: Optional[tf.distribute.InputContext] = None
-  ) -> tf.data.Dataset:
-    """Sends all data files to every worker and then shards by data."""
-    dataset = dataset_fn(matched_files)
-
-    # When `input_file` is a path to a single file or the number of files is
-    # less than the number of input pipelines, disable auto sharding
-    # so that the same input file is sent to all workers.
-    options = tf.data.Options()
-    options.experimental_distribute.auto_shard_policy = (
-        tf.data.experimental.AutoShardPolicy.OFF)
-    dataset = dataset.with_options(options)
-    # Do not enable sharding if tf.data service is enabled, as sharding will be
-    # handled inside tf.data service.
-    if self._sharding and input_context and (
-        input_context.num_input_pipelines > 1 and
-        not self._enable_tf_data_service):
-      dataset = dataset.shard(input_context.num_input_pipelines,
-                              input_context.input_pipeline_id)
-
-    # If cache is enabled, we will call `repeat()` later after `cache()`.
-    if self._is_training and not self._cache:
-      dataset = dataset.repeat()
-    return dataset
-
-  def _read_tfds(
-      self,
-      input_context: Optional[tf.distribute.InputContext] = None
-  ) -> tf.data.Dataset:
-    """Reads a dataset from tfds."""
-    # No-op if the data already exists.
-    self._tfds_builder.download_and_prepare()
-
-    read_config = tfds.ReadConfig(
-        interleave_cycle_length=self._cycle_length,
-        interleave_block_length=self._block_length,
-        input_context=input_context,
-        shuffle_seed=self._seed)
-    decoders = {}
-    if self._tfds_skip_decoding_feature:
-      for skip_feature in self._tfds_skip_decoding_feature.split(','):
-        decoders[skip_feature.strip()] = tfds.decode.SkipDecoding()
-    dataset = self._tfds_builder.as_dataset(
-        split=self._tfds_split,
-        shuffle_files=self._is_training,
-        as_supervised=self._tfds_as_supervised,
-        decoders=decoders,
-        read_config=read_config)
-
-    # If cache is enabled, we will call `repeat()` later after `cache()`.
- if self._is_training and not self._cache: - dataset = dataset.repeat() - return dataset - - @property - def tfds_info(self) -> tfds.core.DatasetInfo: - """Returns TFDS dataset info, if available.""" - if self._tfds_builder: - return self._tfds_builder.info - else: - raise ValueError('tfds_info is not available, because the dataset ' - 'is not loaded from tfds.') - - def _read_decode_and_parse_dataset( - self, - matched_files: List[str], - dataset_fn, - batch_size: int, - input_context: Optional[tf.distribute.InputContext] = None, - tfds_builder: bool = False) -> tf.data.Dataset: - """Returns a tf.data.Dataset object after reading, decoding, and parsing.""" - if tfds_builder: - dataset = self._read_tfds(input_context) - elif len(matched_files) > 1: - if input_context and (len(matched_files) < - input_context.num_input_pipelines): - logging.warn( - 'The number of files %d is less than the number of input pipelines ' - '%d. We will send all input files to every worker. ' - 'Please consider sharding your data into more files.', - len(matched_files), input_context.num_input_pipelines) - dataset = self._read_files_then_shard(matched_files, - dataset_fn, - input_context) - else: - dataset = self._shard_files_then_read(matched_files, - dataset_fn, - input_context) - elif len(matched_files) == 1: - dataset = self._read_files_then_shard(matched_files, - dataset_fn, - input_context) - else: - raise ValueError('It is unexpected that `tfds_builder` is None and ' - 'there is also no `matched_files`.') - - # If cache is enabled, we will call `shuffle()` later after `cache()`. - if self._is_training and not self._cache: - dataset = dataset.shuffle(self._shuffle_buffer_size, seed=self._seed) - - dataset = _maybe_map_fn(dataset, self._decoder_fn) - if self._sample_fn is not None: - dataset = dataset.apply(self._sample_fn) - dataset = _maybe_map_fn(dataset, self._parser_fn) - - if self._cache: - dataset = dataset.cache() - if self._is_training: - dataset = dataset.repeat() - dataset = dataset.shuffle(self._shuffle_buffer_size, seed=self._seed) - - if self._transform_and_batch_fn is not None: - dataset = self._transform_and_batch_fn(dataset, input_context) - else: - per_replica_batch_size = input_context.get_per_replica_batch_size( - batch_size) if input_context else batch_size - dataset = dataset.batch( - per_replica_batch_size, drop_remainder=self._drop_remainder - ) - - return dataset - - def _maybe_apply_data_service( - self, - dataset: tf.data.Dataset, - input_context: Optional[tf.distribute.InputContext] = None - ) -> tf.data.Dataset: - """Potentially distributes a dataset.""" - if self._enable_tf_data_service and input_context: - if self._enable_round_robin_tf_data_service: - replicas_per_input_pipeline = input_context.num_replicas_in_sync // ( - input_context.num_input_pipelines) - base_consumer_index = input_context.input_pipeline_id * ( - replicas_per_input_pipeline) - num_consumers = input_context.num_input_pipelines * ( - replicas_per_input_pipeline) - range_dataset = tf.data.Dataset.range(replicas_per_input_pipeline) - dataset = range_dataset.map(lambda i: dataset.apply( # pylint: disable=g-long-lambda - tf.data.experimental.service.distribute( - processing_mode='parallel_epochs', - service=self._tf_data_service_address, - job_name=self._tf_data_service_job_name, - consumer_index=base_consumer_index + i, - num_consumers=num_consumers))) - # Use parallel interleave to read multiple batches from a tf.data - # service worker in parallel. 
- dataset = dataset.interleave( - lambda x: x, - cycle_length=replicas_per_input_pipeline, - num_parallel_calls=replicas_per_input_pipeline, - deterministic=True) - else: - dataset = dataset.apply( - tf.data.experimental.service.distribute( - processing_mode='parallel_epochs', - service=self._tf_data_service_address, - job_name=self._tf_data_service_job_name)) - return dataset - - def read( - self, - input_context: Optional[tf.distribute.InputContext] = None - ) -> tf.data.Dataset: - """Generates a tf.data.Dataset object.""" - dataset = self._read_decode_and_parse_dataset(self._matched_files, - self._dataset_fn, - self._global_batch_size, - input_context, - self._tfds_builder) - dataset = _maybe_map_fn(dataset, self._postprocess_fn) - dataset = self._maybe_apply_data_service(dataset, input_context) - - if self._deterministic is not None: - options = tf.data.Options() - options.experimental_deterministic = self._deterministic - dataset = dataset.with_options(options) - return dataset.prefetch(tf.data.experimental.AUTOTUNE) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/registry.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/registry.py deleted file mode 100644 index 193552eae451b518f82ceeac6616d49ec76a2bbf..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/registry.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Registry utility.""" - - -def register(registered_collection, reg_key): - """Register decorated function or class to collection. - - Register decorated function or class into registered_collection, in a - hierarchical order. For example, when reg_key="my_model/my_exp/my_config_0" - the decorated function or class is stored under - registered_collection["my_model"]["my_exp"]["my_config_0"]. - This decorator is supposed to be used together with the lookup() function in - this file. - - Args: - registered_collection: a dictionary. The decorated function or class will be - put into this collection. - reg_key: The key for retrieving the registered function or class. 
If reg_key - is a string, it can be hierarchical like my_model/my_exp/my_config_0 - Returns: - A decorator function - Raises: - KeyError: when function or class to register already exists. - """ - def decorator(fn_or_cls): - """Put fn_or_cls in the dictionary.""" - if isinstance(reg_key, str): - hierarchy = reg_key.split("/") - collection = registered_collection - for h_idx, entry_name in enumerate(hierarchy[:-1]): - if entry_name not in collection: - collection[entry_name] = {} - collection = collection[entry_name] - if not isinstance(collection, dict): - raise KeyError( - "Collection path {} at position {} already registered as " - "a function or class.".format(entry_name, h_idx)) - leaf_reg_key = hierarchy[-1] - else: - collection = registered_collection - leaf_reg_key = reg_key - - if leaf_reg_key in collection: - raise KeyError("Function or class {} registered multiple times.".format( - leaf_reg_key)) - - collection[leaf_reg_key] = fn_or_cls - return fn_or_cls - return decorator - - -def lookup(registered_collection, reg_key): - """Lookup and return decorated function or class in the collection. - - Lookup decorated function or class in registered_collection, in a - hierarchical order. For example, when - reg_key="my_model/my_exp/my_config_0", - this function will return - registered_collection["my_model"]["my_exp"]["my_config_0"]. - - Args: - registered_collection: a dictionary. The decorated function or class will be - retrieved from this collection. - reg_key: The key for retrieving the registered function or class. If reg_key - is a string, it can be hierarchical like my_model/my_exp/my_config_0 - Returns: - The registered function or class. - Raises: - LookupError: when reg_key cannot be found. - """ - if isinstance(reg_key, str): - hierarchy = reg_key.split("/") - collection = registered_collection - for h_idx, entry_name in enumerate(hierarchy): - if entry_name not in collection: - raise LookupError( - "collection path {} at position {} never registered.".format( - entry_name, h_idx)) - collection = collection[entry_name] - return collection - else: - if reg_key not in registered_collection: - raise LookupError("registration key {} never registered.".format(reg_key)) - return registered_collection[reg_key] diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/registry_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/registry_test.py deleted file mode 100644 index 9d8ca0533d92dd8f83138e69d7dfc701e2f917b1..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/registry_test.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Tests for registry.""" - -import tensorflow as tf -from official.core import registry - - -class RegistryTest(tf.test.TestCase): - - def test_register(self): - collection = {} - - @registry.register(collection, 'functions/func_0') - def func_test(): - pass - - self.assertEqual(registry.lookup(collection, 'functions/func_0'), func_test) - - @registry.register(collection, 'classes/cls_0') - class ClassRegistryKey: - pass - - self.assertEqual( - registry.lookup(collection, 'classes/cls_0'), ClassRegistryKey) - - @registry.register(collection, ClassRegistryKey) - class ClassRegistryValue: - pass - - self.assertEqual( - registry.lookup(collection, ClassRegistryKey), ClassRegistryValue) - - def test_register_hierarchy(self): - collection = {} - - @registry.register(collection, 'functions/func_0') - def func_test0(): - pass - - @registry.register(collection, 'func_1') - def func_test1(): - pass - - @registry.register(collection, func_test1) - def func_test2(): - pass - - expected_collection = { - 'functions': { - 'func_0': func_test0, - }, - 'func_1': func_test1, - func_test1: func_test2, - } - self.assertEqual(collection, expected_collection) - - def test_register_error(self): - collection = {} - - @registry.register(collection, 'functions/func_0') - def func_test0(): # pylint: disable=unused-variable - pass - - with self.assertRaises(KeyError): - - @registry.register(collection, 'functions/func_0/sub_func') - def func_test1(): # pylint: disable=unused-variable - pass - - with self.assertRaises(LookupError): - registry.lookup(collection, 'non-exist') - - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/task_factory.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/task_factory.py deleted file mode 100644 index 56cd92948937db563f6398f98196362ebc008617..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/task_factory.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""A global factory to register and access all registered tasks."""
-
-from official.core import registry
-
-_REGISTERED_TASK_CLS = {}
-
-
-# TODO(b/158741360): Add type annotations once pytype checks across modules.
-def register_task_cls(task_config_cls):
-  """Decorates a factory of Tasks for lookup by a subclass of TaskConfig.
-
-  This decorator supports registration of tasks as follows:
-
-  ```
-  @dataclasses.dataclass
-  class MyTaskConfig(TaskConfig):
-    # Add fields here.
-    pass
-
-  @register_task_cls(MyTaskConfig)
-  class MyTask(Task):
-    # Inherits def __init__(self, task_config).
-    pass
-
-  my_task_config = MyTaskConfig()
-  my_task = get_task(my_task_config)  # Returns MyTask(my_task_config).
-  ```
-
-  Besides a class itself, other callables that create a Task from a TaskConfig
-  can be decorated by the result of this function, as long as there is at most
-  one registration for each config class.
-
-  Args:
-    task_config_cls: a subclass of TaskConfig (*not* an instance of TaskConfig).
-      Each task_config_cls can only be used for a single registration.
-
-  Returns:
-    A callable for use as a class decorator that registers the decorated class
-    for creation from an instance of task_config_cls.
-  """
-  return registry.register(_REGISTERED_TASK_CLS, task_config_cls)
-
-
-def get_task(task_config, **kwargs):
-  """Creates a Task (of suitable subclass type) from task_config."""
-  return get_task_cls(task_config.__class__)(task_config, **kwargs)
-
-
-# The user-visible get_task() is defined after classes have been registered.
-# TODO(b/158741360): Add type annotations once pytype checks across modules.
-def get_task_cls(task_config_cls):
-  task_cls = registry.lookup(_REGISTERED_TASK_CLS, task_config_cls)
-  return task_cls
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/train_lib.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/train_lib.py
deleted file mode 100644
index 1a03ecf1cfdb6d5e6bdb06b872f8ddbe5a823799..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/train_lib.py
+++ /dev/null
@@ -1,153 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
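To complement the docstring example in task_factory.py above, a hedged, non-authoritative sketch of the registration flow (MyTaskConfig and MyTask are hypothetical names; TaskConfig and Task are the official.core classes the docstring assumes; a real Task subclass would also override its build methods):

```python
import dataclasses
from official.core import base_task, task_factory
from official.core.config_definitions import TaskConfig

@dataclasses.dataclass
class MyTaskConfig(TaskConfig):
  pass  # add experiment-specific fields here

@task_factory.register_task_cls(MyTaskConfig)
class MyTask(base_task.Task):
  pass  # a real task would override build_model / build_inputs, etc.

# Lookup resolves by the *config class*, not by a string key:
assert task_factory.get_task_cls(MyTaskConfig) is MyTask
# get_task(MyTaskConfig()) would then instantiate MyTask with that config.
```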
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""TFM common training driver library."""
-# pytype: disable=attribute-error
-import os
-from typing import Any, Mapping, Tuple, Optional
-
-# Import libraries
-from absl import logging
-import orbit
-import tensorflow as tf
-
-from official.core import base_task
-from official.core import base_trainer
-from official.core import config_definitions
-from official.core import train_utils
-
-maybe_create_best_ckpt_exporter = train_utils.maybe_create_best_ckpt_exporter
-
-
-def run_experiment(
-    distribution_strategy: tf.distribute.Strategy,
-    task: base_task.Task,
-    mode: str,
-    params: config_definitions.ExperimentConfig,
-    model_dir: str,
-    run_post_eval: bool = False,
-    save_summary: bool = True,
-    trainer: Optional[base_trainer.Trainer] = None
-) -> Tuple[tf.keras.Model, Mapping[str, Any]]:
-  """Runs train/eval configured by the experiment params.
-
-  Args:
-    distribution_strategy: A distribution strategy.
-    task: A Task instance.
-    mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval'
-      or 'continuous_eval'.
-    params: ExperimentConfig instance.
-    model_dir: A 'str', a path to store model checkpoints and summaries.
-    run_post_eval: Whether to run one evaluation pass after training; when
-      True, the eval metric logs are returned.
-    save_summary: Whether to save train and validation summaries.
-    trainer: An optional base_trainer.Trainer instance. It should be created
-      within the strategy.scope().
-
-  Returns:
-    A 2-tuple of (model, eval_logs).
-      model: `tf.keras.Model` instance.
-      eval_logs: returns eval metrics logs when run_post_eval is set to True,
-        otherwise, returns {}.
-  """
-
-  with distribution_strategy.scope():
-    if not trainer:
-      trainer = train_utils.create_trainer(
-          params,
-          task,
-          train='train' in mode,
-          evaluate=('eval' in mode) or run_post_eval,
-          checkpoint_exporter=maybe_create_best_ckpt_exporter(
-              params, model_dir))
-
-  if trainer.checkpoint:
-    checkpoint_manager = tf.train.CheckpointManager(
-        trainer.checkpoint,
-        directory=model_dir,
-        max_to_keep=params.trainer.max_to_keep,
-        step_counter=trainer.global_step,
-        checkpoint_interval=params.trainer.checkpoint_interval,
-        init_fn=trainer.initialize)
-    # Adds recovery handling.
-    trainer.add_recovery(params.trainer, checkpoint_manager=checkpoint_manager)
-  else:
-    checkpoint_manager = None
-
-  controller = orbit.Controller(
-      strategy=distribution_strategy,
-      trainer=trainer if 'train' in mode else None,
-      evaluator=trainer,
-      global_step=trainer.global_step,
-      steps_per_loop=params.trainer.steps_per_loop,
-      checkpoint_manager=checkpoint_manager,
-      summary_dir=os.path.join(model_dir, 'train') if (save_summary) else None,
-      eval_summary_dir=os.path.join(model_dir,
-                                    params.trainer.validation_summary_subdir) if
-      (save_summary) else None,
-      summary_interval=params.trainer.summary_interval if
-      (save_summary) else None)
-
-  logging.info('Starting to execute mode: %s', mode)
-  with distribution_strategy.scope():
-    if mode == 'train':
-      controller.train(steps=params.trainer.train_steps)
-    elif mode == 'train_and_eval':
-      controller.train_and_evaluate(
-          train_steps=params.trainer.train_steps,
-          eval_steps=params.trainer.validation_steps,
-          eval_interval=params.trainer.validation_interval)
-    elif mode == 'eval':
-      controller.evaluate(steps=params.trainer.validation_steps)
-    elif mode == 'continuous_eval':
-
-      def timeout_fn():
-        if trainer.global_step.numpy() >= params.trainer.train_steps:
-          return True
-        return False
-
-      controller.evaluate_continuously(
-          steps=params.trainer.validation_steps,
-          timeout=params.trainer.continuous_eval_timeout,
-          timeout_fn=timeout_fn)
-    else:
-      raise NotImplementedError('The mode is not implemented: %s' % mode)
-
-  num_params = train_utils.try_count_params(trainer.model)
-  if num_params is not None:
-    logging.info('Number of trainable params in model: %f million.',
                 num_params / 10.**6)
-
-  if run_post_eval:
-    with distribution_strategy.scope():
-      return trainer.model, trainer.evaluate(
-          tf.convert_to_tensor(params.trainer.validation_steps))
-  else:
-    return trainer.model, {}
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/train_lib_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/train_lib_test.py
deleted file mode 100644
index 71f5ac5c41c41a1d2953fbaf4461ec4a33b26dc9..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/train_lib_test.py
+++ /dev/null
@@ -1,153 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
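For orientation, a sketch of how run_experiment above is typically driven; it mirrors the flow of train_lib_test.py below. The experiment name and the model directory are placeholders, and the registry_imports module is assumed to register the experiments, as in the test:

```python
import tensorflow as tf
from official.common import registry_imports  # noqa: F401 -- registers experiments
from official.core import exp_factory, task_factory, train_lib

strategy = tf.distribute.get_strategy()      # default strategy, for illustration
params = exp_factory.get_exp_config('mock')  # placeholder experiment name
model_dir = '/tmp/model_dir'                 # placeholder path

with strategy.scope():
  task = task_factory.get_task(params.task, logging_dir=model_dir)

model, eval_logs = train_lib.run_experiment(
    distribution_strategy=strategy,
    task=task,
    mode='train_and_eval',
    params=params,
    model_dir=model_dir,
    run_post_eval=True)  # eval metric logs are returned only when True
```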
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Tests for train_ctl_lib.""" -import json -import os - -from absl import flags -from absl.testing import flagsaver -from absl.testing import parameterized -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations -from official.common import flags as tfm_flags -# pylint: disable=unused-import -from official.common import registry_imports -# pylint: enable=unused-import -from official.core import task_factory -from official.core import train_lib -from official.core import train_utils - -FLAGS = flags.FLAGS - -tfm_flags.define_flags() - - -class TrainTest(tf.test.TestCase, parameterized.TestCase): - - def setUp(self): - super(TrainTest, self).setUp() - self._test_config = { - 'trainer': { - 'checkpoint_interval': 10, - 'steps_per_loop': 10, - 'summary_interval': 10, - 'train_steps': 10, - 'validation_steps': 5, - 'validation_interval': 10, - 'continuous_eval_timeout': 1, - 'validation_summary_subdir': 'validation', - 'optimizer_config': { - 'optimizer': { - 'type': 'sgd', - }, - 'learning_rate': { - 'type': 'constant' - } - } - }, - } - - @combinations.generate( - combinations.combine( - distribution_strategy=[ - strategy_combinations.default_strategy, - strategy_combinations.cloud_tpu_strategy, - strategy_combinations.one_device_strategy_gpu, - ], - flag_mode=['train', 'eval', 'train_and_eval'], - run_post_eval=[True, False])) - def test_end_to_end(self, distribution_strategy, flag_mode, run_post_eval): - model_dir = self.get_temp_dir() - flags_dict = dict( - experiment='mock', - mode=flag_mode, - model_dir=model_dir, - params_override=json.dumps(self._test_config)) - with flagsaver.flagsaver(**flags_dict): - params = train_utils.parse_configuration(flags.FLAGS) - train_utils.serialize_config(params, model_dir) - with distribution_strategy.scope(): - task = task_factory.get_task(params.task, logging_dir=model_dir) - - _, logs = train_lib.run_experiment( - distribution_strategy=distribution_strategy, - task=task, - mode=flag_mode, - params=params, - model_dir=model_dir, - run_post_eval=run_post_eval) - - if 'eval' in flag_mode: - self.assertTrue( - tf.io.gfile.exists( - os.path.join(model_dir, - params.trainer.validation_summary_subdir))) - if run_post_eval: - self.assertNotEmpty(logs) - else: - self.assertEmpty(logs) - self.assertNotEmpty( - tf.io.gfile.glob(os.path.join(model_dir, 'params.yaml'))) - if flag_mode == 'eval': - return - self.assertNotEmpty( - tf.io.gfile.glob(os.path.join(model_dir, 'checkpoint'))) - # Tests continuous evaluation. 
- _, logs = train_lib.run_experiment( - distribution_strategy=distribution_strategy, - task=task, - mode='continuous_eval', - params=params, - model_dir=model_dir, - run_post_eval=run_post_eval) - print(logs) - - def test_parse_configuration(self): - model_dir = self.get_temp_dir() - flags_dict = dict( - experiment='mock', - mode='train', - model_dir=model_dir, - params_override=json.dumps(self._test_config)) - with flagsaver.flagsaver(**flags_dict): - params = train_utils.parse_configuration(flags.FLAGS, lock_return=True) - with self.assertRaises(ValueError): - params.override({'task': {'init_checkpoint': 'Foo'}}) - - params = train_utils.parse_configuration(flags.FLAGS, lock_return=False) - params.override({'task': {'init_checkpoint': 'Bar'}}) - self.assertEqual(params.task.init_checkpoint, 'Bar') - - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/train_utils.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/train_utils.py deleted file mode 100644 index 4e9ff0274fb5aaaa644880aacf7ee1d877918b24..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/train_utils.py +++ /dev/null @@ -1,406 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Training utils.""" -import copy -import json -import os -import pprint -from typing import Any, Callable, Dict, List, Optional - -from absl import logging -import dataclasses -import gin -import orbit -import tensorflow as tf - -from official.core import base_task -from official.core import base_trainer -from official.core import config_definitions -from official.core import exp_factory -from official.modeling import hyperparams - - -def get_leaf_nested_dict(d: Dict[str, Any], keys: List[str]) -> Dict[str, Any]: - """Get leaf from a dictionary with arbitrary depth with a list of keys. - - Args: - d: The dictionary to extract value from. - keys: The list of keys to extract values recursively. - - Returns: - The value of the leaf. - - Raises: - KeyError: If the value of keys extracted is a dictionary. 
- """ - leaf = d - for k in keys: - if not isinstance(leaf, dict) or k not in leaf: - raise KeyError( - 'Path not exist while traversing the dictionary: d with keys' - ': %s.' % keys) - leaf = leaf[k] - - if isinstance(leaf, dict): - raise KeyError('The value extracted with keys: %s is not a leaf of the ' - 'dictionary: %s.' % (keys, d)) - return leaf - - -def cast_leaf_nested_dict(d: Dict[str, Any], - cast_fn: Callable[[Any], Any]) -> Dict[str, Any]: - """Cast the leaves of a dictionary with arbitrary depth in place. - - Args: - d: The dictionary to extract value from. - cast_fn: The casting function. - - Returns: - A dictionray with the same structure as d. - """ - for key, value in d.items(): - if isinstance(value, dict): - d[key] = cast_leaf_nested_dict(value, cast_fn) - else: - d[key] = cast_fn(value) - return d - - -def maybe_create_best_ckpt_exporter(params: config_definitions.ExperimentConfig, - data_dir: str) -> Any: - """Maybe create a BestCheckpointExporter object, according to the config.""" - export_subdir = params.trainer.best_checkpoint_export_subdir - metric_name = params.trainer.best_checkpoint_eval_metric - metric_comp = params.trainer.best_checkpoint_metric_comp - if data_dir and export_subdir and metric_name: - best_ckpt_dir = os.path.join(data_dir, export_subdir) - best_ckpt_exporter = BestCheckpointExporter(best_ckpt_dir, metric_name, - metric_comp) - logging.info( - 'Created the best checkpoint exporter. ' - 'data_dir: %s, export_subdir: %s, metric_name: %s', data_dir, - export_subdir, metric_name) - else: - best_ckpt_exporter = None - - return best_ckpt_exporter - - -# TODO(b/180147589): Add tests for this module. -class BestCheckpointExporter: - """Keeps track of the best result, and saves its checkpoint. - - Orbit will support an API for checkpoint exporter. This class will be used - together with orbit once this functionality is ready. - """ - - def __init__(self, export_dir: str, metric_name: str, metric_comp: str): - """Initialization. - - Args: - export_dir: The directory that will contain exported checkpoints. - metric_name: Indicates which metric to look at, when determining which - result is better. If eval_logs being passed to maybe_export_checkpoint - is a nested dictionary, use `|` as a seperator for different layers. - metric_comp: Indicates how to compare results. Either `lower` or `higher`. - """ - self._export_dir = export_dir - self._metric_name = metric_name.split('|') - self._metric_comp = metric_comp - if self._metric_comp not in ('lower', 'higher'): - raise ValueError('best checkpoint metric comp must be one of ' - 'higher, lower. 
Got: {}'.format(self._metric_comp)) - tf.io.gfile.makedirs(os.path.dirname(self.best_ckpt_logs_path)) - self._best_ckpt_logs = self._maybe_load_best_eval_metric() - self._checkpoint_manager = None - - def _get_checkpoint_manager(self, checkpoint): - """Gets an existing checkpoint manager or creates a new one.""" - if self._checkpoint_manager is None or (self._checkpoint_manager.checkpoint - != checkpoint): - logging.info('Creates a new checkpoint manager.') - self._checkpoint_manager = tf.train.CheckpointManager( - checkpoint, - directory=self._export_dir, - max_to_keep=1, - checkpoint_name='best_ckpt') - - return self._checkpoint_manager - - def maybe_export_checkpoint(self, checkpoint, eval_logs, global_step): - logging.info('[BestCheckpointExporter] received eval_logs: %s, at step: %d', - eval_logs, global_step) - if self._best_ckpt_logs is None or self._new_metric_is_better( - self._best_ckpt_logs, eval_logs): - self._best_ckpt_logs = eval_logs - self._export_best_eval_metric(checkpoint, self._best_ckpt_logs, - global_step) - - def _maybe_load_best_eval_metric(self): - if not tf.io.gfile.exists(self.best_ckpt_logs_path): - return None - with tf.io.gfile.GFile(self.best_ckpt_logs_path, 'r') as reader: - return json.loads(reader.read()) - - def _new_metric_is_better(self, old_logs, new_logs): - """Check if the metric in new_logs is better than the metric in old_logs.""" - old_value = float( - orbit.utils.get_value( - get_leaf_nested_dict(old_logs, self._metric_name))) - new_value = float( - orbit.utils.get_value( - get_leaf_nested_dict(new_logs, self._metric_name))) - - logging.info('[BestCheckpointExporter] comparing results. old: %f, new: %f', - old_value, new_value) - if self._metric_comp == 'higher': - if new_value > old_value: - logging.info('[BestCheckpointExporter] ' - 'the new number is better since it is higher.') - return True - else: # self._metric_comp == 'lower': - if new_value < old_value: - logging.info('[BestCheckpointExporter] ' - 'the new number is better since it is lower.') - return True - return False - - def _export_best_eval_metric(self, checkpoint, eval_logs, global_step): - """Export evaluation results of the best checkpoint into a json file.""" - eval_logs_ext = copy.copy(eval_logs) - eval_logs_ext['best_ckpt_global_step'] = global_step - eval_logs_ext = cast_leaf_nested_dict( - eval_logs_ext, lambda x: float(orbit.utils.get_value(x))) - # Saving json file is very fast. 
- with tf.io.gfile.GFile(self.best_ckpt_logs_path, 'w') as writer: - writer.write(json.dumps(eval_logs_ext, indent=4) + '\n') - - self._get_checkpoint_manager(checkpoint).save() - - @property - def best_ckpt_logs(self): - return self._best_ckpt_logs - - @property - def best_ckpt_logs_path(self): - return os.path.join(self._export_dir, 'info.json') - - @property - def best_ckpt_path(self): - """Returns the best ckpt path or None if there is no ckpt yet.""" - return tf.train.latest_checkpoint(self._export_dir) - - -@gin.configurable -def create_trainer(params: config_definitions.ExperimentConfig, - task: base_task.Task, - train: bool, - evaluate: bool, - checkpoint_exporter: Optional[BestCheckpointExporter] = None, - trainer_cls=base_trainer.Trainer) -> base_trainer.Trainer: - """Create trainer.""" - logging.info('Running default trainer.') - model = task.build_model() - optimizer = task.create_optimizer(params.trainer.optimizer_config, - params.runtime) - return trainer_cls( - params, - task, - model=model, - optimizer=optimizer, - train=train, - evaluate=evaluate, - checkpoint_exporter=checkpoint_exporter) - - -@dataclasses.dataclass -class ParseConfigOptions: - """Use this dataclass instead of FLAGS to customize parse_configuration().""" - experiment: str - config_file: List[str] - tpu: str = '' - tf_data_service: str = '' - params_override: str = '' - - def __contains__(self, name): - return name in dataclasses.asdict(self) - - -def parse_configuration(flags_obj, lock_return=True, print_return=True): - """Parses ExperimentConfig from flags.""" - - # 1. Get the default config from the registered experiment. - params = exp_factory.get_exp_config(flags_obj.experiment) - - # 2. Get the first level of override from `--config_file`. - # `--config_file` is typically used as a template that specifies the common - # override for a particular experiment. - for config_file in flags_obj.config_file or []: - params = hyperparams.override_params_dict( - params, config_file, is_strict=True) - - # 3. Override the TPU address and tf.data service address. - params.override({ - 'runtime': { - 'tpu': flags_obj.tpu, - }, - }) - if ('tf_data_service' in flags_obj and flags_obj.tf_data_service and - isinstance(params.task, config_definitions.TaskConfig)): - params.override({ - 'task': { - 'train_data': { - 'tf_data_service_address': flags_obj.tf_data_service, - }, - 'validation_data': { - 'tf_data_service_address': flags_obj.tf_data_service, - } - } - }) - - # 4. Get the second level of override from `--params_override`. - # `--params_override` is typically used as a further override over the - # template. For example, one may define a particular template for training - # ResNet50 on ImageNet in a config file and pass it via `--config_file`, - # then define different learning rates and pass it via `--params_override`. 
-  if flags_obj.params_override:
-    params = hyperparams.override_params_dict(
-        params, flags_obj.params_override, is_strict=True)
-
-  params.validate()
-  if lock_return:
-    params.lock()
-
-  if print_return:
-    pp = pprint.PrettyPrinter()
-    logging.info('Final experiment parameters: %s',
                 pp.pformat(params.as_dict()))
-
-  return params
-
-
-def serialize_config(params: config_definitions.ExperimentConfig,
-                     model_dir: str):
-  """Serializes and saves the experiment config."""
-  params_save_path = os.path.join(model_dir, 'params.yaml')
-  logging.info('Saving experiment configuration to %s', params_save_path)
-  tf.io.gfile.makedirs(model_dir)
-  hyperparams.save_params_dict_to_yaml(params, params_save_path)
-
-
-def save_gin_config(filename_surfix: str, model_dir: str):
-  """Serializes and saves the gin configuration."""
-  gin_save_path = os.path.join(
-      model_dir, 'operative_config.{}.gin'.format(filename_surfix))
-  logging.info('Saving gin configurations to %s', gin_save_path)
-  tf.io.gfile.makedirs(model_dir)
-  with tf.io.gfile.GFile(gin_save_path, 'w') as f:
-    f.write(gin.operative_config_str())
-
-
-def read_global_step_from_checkpoint(ckpt_file_path):
-  """Read global step from checkpoint, or get global step from its filename."""
-  global_step = tf.Variable(-1, dtype=tf.int64)
-  ckpt = tf.train.Checkpoint(global_step=global_step)
-  try:
-    ckpt.restore(ckpt_file_path).expect_partial()
-    global_step_maybe_restored = global_step.numpy()
-  except tf.errors.InvalidArgumentError:
-    global_step_maybe_restored = -1
-
-  if global_step_maybe_restored == -1:
-    raise ValueError('global_step not found in checkpoint {}. '
-                     'If you want to run finetune eval jobs, you need to '
-                     'make sure that your pretrain model writes '
-                     'global_step in its checkpoints.'.format(ckpt_file_path))
-  global_step_restored = global_step.numpy()
-  logging.info('get global_step %d from checkpoint %s', global_step_restored,
               ckpt_file_path)
-  return global_step_restored
-
-
-def write_json_summary(log_dir, global_step, eval_metrics):
-  """Dump evaluation metrics to json file."""
-  serializable_dict = {}
-  for name, value in eval_metrics.items():
-    if hasattr(value, 'numpy'):
-      serializable_dict[name] = str(value.numpy())
-    else:
-      serializable_dict[name] = str(value)
-  output_json = os.path.join(log_dir, 'metrics-{}.json'.format(global_step))
-  logging.info('Evaluation results at pretrain step %d: %s', global_step,
               serializable_dict)
-  with tf.io.gfile.GFile(output_json, 'w') as writer:
-    writer.write(json.dumps(serializable_dict, indent=4) + '\n')
-
-
-def write_summary(summary_writer, global_step, eval_metrics):
-  """Write evaluation metrics to TF summary."""
-  numeric_dict = {}
-  for name, value in eval_metrics.items():
-    numeric_dict[name] = float(orbit.utils.get_value(value))
-  with summary_writer.as_default():
-    for name, value in numeric_dict.items():
-      tf.summary.scalar(name, value, step=global_step)
-    summary_writer.flush()
-
-
-def remove_ckpts(model_dir):
-  """Remove model checkpoints, so we can restart."""
-  ckpts = os.path.join(model_dir, 'ckpt-*')
-  logging.info('removing checkpoint files %s', ckpts)
-  for file_to_remove in tf.io.gfile.glob(ckpts):
-    tf.io.gfile.rmtree(file_to_remove)
-
-  file_to_remove = os.path.join(model_dir, 'checkpoint')
-  if tf.io.gfile.exists(file_to_remove):
-    tf.io.gfile.remove(file_to_remove)
-
-
-def try_count_params(model: tf.keras.Model):
-  """Count the number of parameters in the model, if possible.
-
-  Args:
-    model: The model to count parameters for.
-
-  Returns:
-    The number of parameters or None.
-  """
-  if hasattr(model, 'count_params'):
-    try:
-      return model.count_params()
-    except ValueError:
-      logging.info('Number of trainable params unknown, because the build() '
-                   'methods in keras layers were not called. This is probably '
-                   'because the model was not fed any input, e.g., the max '
-                   'train step was already reached before this run.')
-      return None
-  return None
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/train_utils_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/train_utils_test.py
deleted file mode 100644
index b6b607848b99dfad4aa4100897cfbe8ebf8ea361..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/core/train_utils_test.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
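The ParseConfigOptions dataclass above stands in for absl FLAGS; a hedged sketch of driving parse_configuration programmatically (the experiment name and the override dict are placeholders; 'mock' must name a registered experiment, as in the tests):

```python
import json
from official.core import train_utils

options = train_utils.ParseConfigOptions(
    experiment='mock',  # placeholder; must name a registered experiment
    config_file=[],
    params_override=json.dumps({'trainer': {'train_steps': 10}}))

params = train_utils.parse_configuration(options, lock_return=False)
params.override({'trainer': {'train_steps': 20}})  # allowed since not locked
```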
-# - -"""Tests for official.core.train_utils.""" - -import tensorflow as tf - -from official.core import train_utils - - -class TrainUtilsTest(tf.test.TestCase): - - def test_get_leaf_nested_dict(self): - d = {'a': {'i': {'x': 5}}} - self.assertEqual(train_utils.get_leaf_nested_dict(d, ['a', 'i', 'x']), 5) - - def test_get_leaf_nested_dict_not_leaf(self): - with self.assertRaisesRegex(KeyError, 'The value extracted with keys.*'): - d = {'a': {'i': {'x': 5}}} - train_utils.get_leaf_nested_dict(d, ['a', 'i']) - - def test_get_leaf_nested_dict_path_not_exist_missing_key(self): - with self.assertRaisesRegex(KeyError, 'Path not exist while traversing .*'): - d = {'a': {'i': {'x': 5}}} - train_utils.get_leaf_nested_dict(d, ['a', 'i', 'y']) - - def test_get_leaf_nested_dict_path_not_exist_out_of_range(self): - with self.assertRaisesRegex(KeyError, 'Path not exist while traversing .*'): - d = {'a': {'i': {'x': 5}}} - train_utils.get_leaf_nested_dict(d, ['a', 'i', 'z']) - - def test_get_leaf_nested_dict_path_not_exist_meets_leaf(self): - with self.assertRaisesRegex(KeyError, 'Path not exist while traversing .*'): - d = {'a': {'i': 5}} - train_utils.get_leaf_nested_dict(d, ['a', 'i', 'z']) - - def test_cast_leaf_nested_dict(self): - d = {'a': {'i': {'x': '123'}}, 'b': 456.5} - d = train_utils.cast_leaf_nested_dict(d, int) - self.assertEqual(d['a']['i']['x'], 123) - self.assertEqual(d['b'], 456) - - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/__init__.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/__init__.py deleted file mode 100644 index a11b1ff79e891e0fcee5bf824718e75d9103e28f..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/__init__.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/__init__.py deleted file mode 100644 index 7f956f13392054a4fc63d3a37bd7343077b15e29..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/__init__.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Activations package definition.""" -from official.modeling.activations.gelu import gelu -from official.modeling.activations.relu import relu6 -from official.modeling.activations.sigmoid import hard_sigmoid -from official.modeling.activations.swish import hard_swish -from official.modeling.activations.swish import identity -from official.modeling.activations.swish import simple_swish diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/gelu.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/gelu.py deleted file mode 100644 index d32acd1d65db29422f1e1a84c8eb523176bf56a1..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/gelu.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
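Each activation in the package above is registered with @tf.keras.utils.register_keras_serializable(package='Text'), so Keras can serialize models that use them. A brief usage sketch; note that in this NPU port, importing the package also pulls in npu_device via gelu.py below, so this assumes the NPU stack (or the stock TF Model Garden package, where the import is dependency-free):

```python
import tensorflow as tf
from official.modeling import activations  # NB: imports gelu.py, see note above

# Registered activations can be passed to Keras layers like any callable.
layer = tf.keras.layers.Dense(64, activation=activations.relu6)
out = layer(tf.random.normal([2, 16]))
print(out.shape)  # (2, 64)

# Because of the serializable registration under the 'Text' package, a model
# using these activations survives a tf.keras save/load round trip.
```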
-# ==============================================================================
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-"""Gaussian error linear unit."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import npu_device
-import math
-import tensorflow as tf
-import common_flags
-from tensorflow.python.framework import ops
-from npu_device.npu_device import gen_npu_ops as npu_aicore_ops
-from absl import flags
-
-FLAGS = flags.FLAGS
-
-
-#@ops.RegisterGradient("FastGelu")
-def _fast_gelu_grad(op, grad):
-  """The gradient for FastGelu.
-
-  Args:
-    op: The FastGelu operation that we are differentiating, which we can use
-      to find the inputs and outputs of the original op.
-    grad: Gradient with respect to the output of the fast_gelu op.
-
-  Returns:
-    Gradient with respect to the input of fast_gelu.
-  """
-  return [npu_aicore_ops.fast_gelu_grad(grad, op.inputs[0])]
-
-grad_registry_list = ops.gradient_registry.list()
-if not hasattr(npu_device.ops, 'gelu') and "FastGelu" not in grad_registry_list:
-  ops.RegisterGradient("FastGelu")(_fast_gelu_grad)
-
-@tf.keras.utils.register_keras_serializable(package='Text')
-def gelu(x):
-  """Gaussian Error Linear Unit.
-
-  Original paper: https://arxiv.org/abs/1606.08415
-  The approximate version is faster.
-
-  Args:
-    x: float Tensor to perform activation.
-
-  Returns:
-    `x` with the GELU activation applied.
-  """
-  if FLAGS.use_fastgelu:
-    if not hasattr(npu_device.ops, 'gelu'):
-      return npu_device.gen_npu_ops.fast_gelu(x)
-    else:
-      fast_gelu = getattr(npu_device.ops, 'gelu')
-      return fast_gelu(x)
-    #return npu_aicore_ops.fast_gelu(x)
-  else:
-    cdf = 0.5 * (1.0 + tf.tanh(
-        (math.sqrt(2 / math.pi) * (x + 0.044715 * tf.pow(x, 3)))))
-    return x * cdf
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/gelu_origin.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/gelu_origin.py
deleted file mode 100644
index 39348061e57edc22857c2984f297d9fc23d365e8..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/gelu_origin.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
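The fallback branch of gelu() above implements the tanh approximation by hand; a standalone reference that mirrors it, assuming only stock TensorFlow (no NPU dependencies):

```python
import math
import tensorflow as tf

def gelu_tanh(x):
  """Tanh approximation of GELU, matching the fallback branch above."""
  x = tf.convert_to_tensor(x, dtype=tf.float32)
  cdf = 0.5 * (1.0 + tf.tanh(
      math.sqrt(2 / math.pi) * (x + 0.044715 * tf.pow(x, 3))))
  return x * cdf

# Should agree with tf.keras.activations.gelu(x, approximate=True)
# (used by gelu_origin.py below) to float32 precision.
print(gelu_tanh([[0.25, 0.0, -0.25]]).numpy())  # ~[[0.1497, 0., -0.1003]]
```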
-# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Gaussian error linear unit.""" - -import tensorflow as tf - - -@tf.keras.utils.register_keras_serializable(package='Text') -def gelu(x): - """Gaussian Error Linear Unit. - - This is a smoother version of the RELU. - Original paper: https://arxiv.org/abs/1606.08415 - Args: - x: float Tensor to perform activation. - - Returns: - `x` with the GELU activation applied. - """ - return tf.keras.activations.gelu(x, approximate=True) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/gelu_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/gelu_test.py deleted file mode 100644 index f833b655066d8bc19334ca86905955876218d01f..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/gelu_test.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""Tests for the Gaussian error linear unit.""" - -import tensorflow as tf - -from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import -from official.modeling import activations - - -@keras_parameterized.run_all_keras_modes -class GeluTest(keras_parameterized.TestCase): - - def test_gelu(self): - expected_data = [[0.14967535, 0., -0.10032465], - [-0.15880796, -0.04540223, 2.9963627]] - gelu_data = activations.gelu([[.25, 0, -.25], [-1, -2, 3]]) - self.assertAllClose(expected_data, gelu_data) - - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/relu.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/relu.py deleted file mode 100644 index 72b2ffb22baca47d5529940fc3c8c3d6a3ec5b9f..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/relu.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Customized Relu activation.""" - -import tensorflow as tf - - -@tf.keras.utils.register_keras_serializable(package='Text') -def relu6(features): - """Computes the Relu6 activation function. - - Args: - features: A `Tensor` representing preactivation values. - - Returns: - The activation value. - """ - features = tf.convert_to_tensor(features) - return tf.nn.relu6(features) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/relu_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/relu_test.py deleted file mode 100644 index 5352de548da25abba4361ee16d9130415dc821df..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/relu_test.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Tests for the customized Relu activation.""" - -import tensorflow as tf - -from tensorflow.python.keras import \ - keras_parameterized # pylint: disable=g-direct-tensorflow-import -from official.modeling import activations - - -@keras_parameterized.run_all_keras_modes -class CustomizedReluTest(keras_parameterized.TestCase): - - def test_relu6(self): - features = [[.25, 0, -.25], [-1, -2, 3]] - customized_relu6_data = activations.relu6(features) - relu6_data = tf.nn.relu6(features) - self.assertAllClose(customized_relu6_data, relu6_data) - - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/sigmoid.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/sigmoid.py deleted file mode 100644 index a331d30b747ad9943c363e185658e2af37b2d423..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/sigmoid.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""Customized Sigmoid activation.""" - -import tensorflow as tf - - -@tf.keras.utils.register_keras_serializable(package='Text') -def hard_sigmoid(features): - """Computes the hard sigmoid activation function. - - Args: - features: A `Tensor` representing preactivation values. - - Returns: - The activation value. - """ - features = tf.convert_to_tensor(features) - return tf.nn.relu6(features + tf.constant(3.)) * 0.16667 diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/sigmoid_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/sigmoid_test.py deleted file mode 100644 index feeb49b17d263584ba3f078143750012be83d25c..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/sigmoid_test.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Tests for the customized Sigmoid activation.""" - -import numpy as np -import tensorflow as tf - -from keras import \ - keras_parameterized # pylint: disable=g-direct-tensorflow-import -from official.modeling import activations - - -@keras_parameterized.run_all_keras_modes -class CustomizedSigmoidTest(keras_parameterized.TestCase): - - def _hard_sigmoid_nn(self, x): - x = np.float32(x) - return tf.nn.relu6(x + 3.) * 0.16667 - - def test_hard_sigmoid(self): - features = [[.25, 0, -.25], [-1, -2, 3]] - customized_hard_sigmoid_data = activations.hard_sigmoid(features) - sigmoid_data = self._hard_sigmoid_nn(features) - self.assertAllClose(customized_hard_sigmoid_data, sigmoid_data) - - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/swish.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/swish.py deleted file mode 100644 index 7a6420e52509cb1936ea147ef09f865029408f8c..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/swish.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
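A quick numeric check of the hard_sigmoid definition above, computed directly from its formula with plain TensorFlow (no repo imports needed); note that the 0.16667 constant approximates 1/6:

```python
import tensorflow as tf

x = tf.constant([-4.0, -3.0, 0.0, 3.0, 4.0])
# hard_sigmoid(x) = relu6(x + 3) * 0.16667: saturates at 0 below -3 and at
# ~1 above +3, and is linear in between.
print((tf.nn.relu6(x + 3.0) * 0.16667).numpy())
# -> [0. 0. 0.50001 1.00002 1.00002]
```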
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""Customized Swish activation."""
-
-import tensorflow as tf
-
-
-@tf.keras.utils.register_keras_serializable(package='Text')
-def simple_swish(features):
-  """Computes the Swish activation function.
-
-  The tf.nn.swish operation uses a custom gradient to reduce memory usage.
-  Since saving custom gradients in SavedModel is currently not supported, and
-  one would not be able to use an exported TF-Hub module for fine-tuning, we
-  provide this wrapper that allows selecting between the native TensorFlow
-  swish operation and a customized operation that uses the default TensorFlow
-  gradient computation.
-
-  Args:
-    features: A `Tensor` representing preactivation values.
-
-  Returns:
-    The activation value.
-  """
-  features = tf.convert_to_tensor(features)
-  return features * tf.nn.sigmoid(features)
-
-
-@tf.keras.utils.register_keras_serializable(package='Text')
-def hard_swish(features):
-  """Computes a hard version of the swish function.
-
-  This operation can be used to reduce computational cost and improve
-  quantization for edge devices.
-
-  Args:
-    features: A `Tensor` representing preactivation values.
-
-  Returns:
-    The activation value.
-  """
-  features = tf.convert_to_tensor(features)
-  return features * tf.nn.relu6(features + tf.constant(3.)) * (1. / 6.)
-
-
-@tf.keras.utils.register_keras_serializable(package='Text')
-def identity(features):
-  """Computes the identity function.
-
-  Useful for helping in quantization.
-
-  Args:
-    features: A `Tensor` representing preactivation values.
-
-  Returns:
-    The activation value.
-  """
-  features = tf.convert_to_tensor(features)
-  return tf.identity(features)
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/swish_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/swish_test.py
deleted file mode 100644
index c4e4b5f0c1a7734ca4e6a885ba04fee63ff438b0..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/activations/swish_test.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Tests for the customized Swish activation.""" -import numpy as np -import tensorflow as tf - -from keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import -from official.modeling import activations - - -@keras_parameterized.run_all_keras_modes -class CustomizedSwishTest(keras_parameterized.TestCase): - - def _hard_swish_np(self, x): - x = np.float32(x) - return x * np.clip(x + 3, 0, 6) / 6 - - def test_simple_swish(self): - features = [[.25, 0, -.25], [-1, -2, 3]] - customized_swish_data = activations.simple_swish(features) - swish_data = tf.nn.swish(features) - self.assertAllClose(customized_swish_data, swish_data) - - def test_hard_swish(self): - features = [[.25, 0, -.25], [-1, -2, 3]] - customized_swish_data = activations.hard_swish(features) - swish_data = self._hard_swish_np(features) - self.assertAllClose(customized_swish_data, swish_data) - - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/hyperparams/__init__.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/hyperparams/__init__.py deleted file mode 100644 index da270edaeea8a43e1e9c4e0a5fe0fd63e4de61d0..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/hyperparams/__init__.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""Hyperparams package definition."""
-# pylint: disable=g-multiple-import
-from official.modeling.hyperparams.base_config import *
-from official.modeling.hyperparams.oneof import *
-from official.modeling.hyperparams.params_dict import *
-
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/hyperparams/base_config.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/hyperparams/base_config.py
deleted file mode 100644
index 03f8edb44bf4d64321dee418165bf1daaa8390b1..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/hyperparams/base_config.py
+++ /dev/null
@@ -1,286 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""Base configurations to standardize experiments."""
-
-import copy
-import functools
-from typing import Any, List, Mapping, Optional, Type
-from absl import logging
-
-import dataclasses
-import tensorflow as tf
-import yaml
-
-from official.modeling.hyperparams import params_dict
-
-
-@dataclasses.dataclass
-class Config(params_dict.ParamsDict):
-  """The base configuration class that supports YAML/JSON based overrides.
-
-  Because of YAML/JSON serialization limitations, some semantics of dataclass
-  are not supported:
-  * It recursively enforces an allowlist of basic types and container types,
-    so it avoids surprises with copy and reuse caused by unanticipated types.
-  * Warning: it converts Dict to `Config` even within sequences,
-    e.g. for config = Config({'key': [([{'a': 42}],)]}),
-    type(config.key[0][0][0]) is Config rather than dict.
-    If you define/annotate some field as Dict, the field will convert to a
-    `Config` instance and lose the dictionary type.
-  """
-
-  # It's safe to add bytes and other immutable types here.
-  IMMUTABLE_TYPES = (str, int, float, bool, type(None))
-  # It's safe to add set, frozenset and other collections here.
-  SEQUENCE_TYPES = (list, tuple)
-
-  default_params: dataclasses.InitVar[Optional[Mapping[str, Any]]] = None
-  restrictions: dataclasses.InitVar[Optional[List[str]]] = None
-
-  @classmethod
-  def _isvalidsequence(cls, v):
-    """Check if the input values are valid sequences.
-
-    Args:
-      v: Input sequence.
-
-    Returns:
-      True if the sequence is valid: its type is one of cls.SEQUENCE_TYPES
-      and its elements are all in cls.IMMUTABLE_TYPES, all dicts, or all
-      ParamsDicts.
-    """
-    if not isinstance(v, cls.SEQUENCE_TYPES):
-      return False
-    return (all(isinstance(e, cls.IMMUTABLE_TYPES) for e in v) or
-            all(isinstance(e, dict) for e in v) or
-            all(isinstance(e, params_dict.ParamsDict) for e in v))
-
-  @classmethod
-  def _import_config(cls, v, subconfig_type):
-    """Returns v with dicts converted to Configs, recursively."""
-    if not issubclass(subconfig_type, params_dict.ParamsDict):
-      raise TypeError(
-          'subconfig_type should be a subclass of ParamsDict, '
-          'found {!r}'.format(subconfig_type))
-    if isinstance(v, cls.IMMUTABLE_TYPES):
-      return v
-    elif isinstance(v, cls.SEQUENCE_TYPES):
-      # Only support one layer of sequence.
-      if not cls._isvalidsequence(v):
-        raise TypeError(
-            'Invalid sequence: only supports single level {!r} of {!r} or '
-            'dict or ParamsDict found: {!r}'.format(cls.SEQUENCE_TYPES,
-                                                    cls.IMMUTABLE_TYPES, v))
-      import_fn = functools.partial(
-          cls._import_config, subconfig_type=subconfig_type)
-      return type(v)(map(import_fn, v))
-    elif isinstance(v, params_dict.ParamsDict):
-      # Deepcopy here is a temporary solution for preserving type in nested
-      # Config object.
-      return copy.deepcopy(v)
-    elif isinstance(v, dict):
-      return subconfig_type(v)
-    else:
-      raise TypeError('Unknown type: {!r}'.format(type(v)))
-
-  @classmethod
-  def _export_config(cls, v):
-    """Returns v with Configs converted to dicts, recursively."""
-    if isinstance(v, cls.IMMUTABLE_TYPES):
-      return v
-    elif isinstance(v, cls.SEQUENCE_TYPES):
-      return type(v)(map(cls._export_config, v))
-    elif isinstance(v, params_dict.ParamsDict):
-      return v.as_dict()
-    elif isinstance(v, dict):
-      raise TypeError('dict value not supported in converting.')
-    else:
-      raise TypeError('Unknown type: {!r}'.format(type(v)))
-
-  @classmethod
-  def _get_subconfig_type(cls, k) -> Type[params_dict.ParamsDict]:
-    """Get element type by the field name.
-
-    Args:
-      k: the key/name of the field.
-
-    Returns:
-      Config as default. If a type annotation is found for `k`,
-      1) returns the type of the annotation if it is a subtype of ParamsDict;
-      2) returns the element type if the annotation of `k` is List[SubType]
-         or Tuple[SubType].
-    """
-    subconfig_type = Config
-    if k in cls.__annotations__:
-      # Directly Config subtype.
-      type_annotation = cls.__annotations__[k]  # pytype: disable=invalid-annotation
-      if (isinstance(type_annotation, type) and
-          issubclass(type_annotation, Config)):
-        subconfig_type = cls.__annotations__[k]  # pytype: disable=invalid-annotation
-      else:
-        # Check if the field is a sequence of subtypes.
-        field_type = getattr(type_annotation, '__origin__', type(None))
-        if (isinstance(field_type, type) and
-            issubclass(field_type, cls.SEQUENCE_TYPES)):
-          element_type = getattr(type_annotation, '__args__', [type(None)])[0]
-          subconfig_type = (
-              element_type if issubclass(element_type, params_dict.ParamsDict)
-              else subconfig_type)
-    return subconfig_type
-
-  def __post_init__(self, default_params, restrictions, *args, **kwargs):
-    super().__init__(
-        default_params=default_params,
-        restrictions=restrictions,
-        *args,
-        **kwargs)
-
-  def _set(self, k, v):
-    """Overrides same method in ParamsDict.
-
-    Also called by ParamsDict methods.
-
-    Args:
-      k: key to set.
-      v: value.
-
-    Raises:
-      RuntimeError
-    """
-    subconfig_type = self._get_subconfig_type(k)
-
-    def is_null(k):
-      if k not in self.__dict__ or not self.__dict__[k]:
-        return True
-      return False
-
-    if isinstance(v, dict):
-      if is_null(k):
-        # If the key does not exist or the value is None, a new Config-family
-        # object should be created for the key.
-        self.__dict__[k] = subconfig_type(v)
-      else:
-        self.__dict__[k].override(v)
-    elif not is_null(k) and isinstance(v, self.SEQUENCE_TYPES) and all(
-        [not isinstance(e, self.IMMUTABLE_TYPES) for e in v]):
-      if len(self.__dict__[k]) == len(v):
-        for i in range(len(v)):
-          self.__dict__[k][i].override(v[i])
-      elif not all([isinstance(e, self.IMMUTABLE_TYPES) for e in v]):
-        logging.warning(
-            'The list/tuple does not match the value dictionaries provided. '
-            'Thus, the list/tuple is determined by the type annotation and '
-            'the values provided. This is error-prone.')
-        self.__dict__[k] = self._import_config(v, subconfig_type)
-      else:
-        self.__dict__[k] = self._import_config(v, subconfig_type)
-    else:
-      self.__dict__[k] = self._import_config(v, subconfig_type)
-
-  def __setattr__(self, k, v):
-    if k not in self.RESERVED_ATTR:
-      if getattr(self, '_locked', False):
-        raise ValueError('The Config has been locked. No change is allowed.')
-    self._set(k, v)
-
-  def _override(self, override_dict, is_strict=True):
-    """Overrides same method in ParamsDict.
-
-    Also called by ParamsDict methods.
-
-    Args:
-      override_dict: dictionary of parameters to override.
-      is_strict: if True, adding new keys is not allowed.
-
-    Raises:
-      KeyError: if overriding reserved keys, or if a key does not exist while
-        `is_strict` is True.
-    """
-    for k, v in sorted(override_dict.items()):
-      if k in self.RESERVED_ATTR:
-        raise KeyError('The key {!r} is internally reserved. '
-                       'Can not be overridden.'.format(k))
-      if k not in self.__dict__:
-        if is_strict:
-          raise KeyError('The key {!r} does not exist in {!r}. '
-                         'To extend the existing keys, use '
-                         '`override` with `is_strict` = False.'.format(
-                             k, type(self)))
-        else:
-          self._set(k, v)
-      else:
-        if isinstance(v, dict) and self.__dict__[k]:
-          self.__dict__[k]._override(v, is_strict)  # pylint: disable=protected-access
-        elif isinstance(v, params_dict.ParamsDict) and self.__dict__[k]:
-          self.__dict__[k]._override(v.as_dict(), is_strict)  # pylint: disable=protected-access
-        else:
-          self._set(k, v)
-
-  def as_dict(self):
-    """Returns a dict representation of params_dict.ParamsDict.
-
-    For the nested params_dict.ParamsDict, a nested dict will be returned.
- """ - return { - k: self._export_config(v) - for k, v in self.__dict__.items() - if k not in self.RESERVED_ATTR - } - - def replace(self, **kwargs): - """Overrides/returns a unlocked copy with the current config unchanged.""" - # pylint: disable=protected-access - params = copy.deepcopy(self) - params._locked = False - params._override(kwargs, is_strict=True) - # pylint: enable=protected-access - return params - - @classmethod - def from_yaml(cls, file_path: str): - # Note: This only works if the Config has all default values. - with tf.io.gfile.GFile(file_path, 'r') as f: - loaded = yaml.load(f, Loader=yaml.FullLoader) - config = cls() - config.override(loaded) - return config - - @classmethod - def from_json(cls, file_path: str): - """Wrapper for `from_yaml`.""" - return cls.from_yaml(file_path) - - @classmethod - def from_args(cls, *args, **kwargs): - """Builds a config from the given list of arguments.""" - attributes = list(cls.__annotations__.keys()) - default_params = {a: p for a, p in zip(attributes, args)} - default_params.update(kwargs) - return cls(default_params) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/hyperparams/base_config_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/hyperparams/base_config_test.py deleted file mode 100644 index 3ae5b3081eaf882ab7c03c8d5ed0482bac4dc2c7..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/hyperparams/base_config_test.py +++ /dev/null @@ -1,376 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import pprint -from typing import List, Tuple - -from absl.testing import parameterized -import dataclasses -import tensorflow as tf -from official.modeling.hyperparams import base_config - - -@dataclasses.dataclass -class DumpConfig1(base_config.Config): - a: int = 1 - b: str = 'text' - - -@dataclasses.dataclass -class DumpConfig2(base_config.Config): - c: int = 2 - d: str = 'text' - e: DumpConfig1 = DumpConfig1() - - -@dataclasses.dataclass -class DumpConfig3(DumpConfig2): - f: int = 2 - g: str = 'text' - h: List[DumpConfig1] = dataclasses.field( - default_factory=lambda: [DumpConfig1(), DumpConfig1()]) - g: Tuple[DumpConfig1, ...] 
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/hyperparams/base_config_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/hyperparams/base_config_test.py
deleted file mode 100644
index 3ae5b3081eaf882ab7c03c8d5ed0482bac4dc2c7..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/hyperparams/base_config_test.py
+++ /dev/null
@@ -1,376 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import pprint
-from typing import List, Tuple
-
-from absl.testing import parameterized
-import dataclasses
-import tensorflow as tf
-from official.modeling.hyperparams import base_config
-
-
-@dataclasses.dataclass
-class DumpConfig1(base_config.Config):
-  a: int = 1
-  b: str = 'text'
-
-
-@dataclasses.dataclass
-class DumpConfig2(base_config.Config):
-  c: int = 2
-  d: str = 'text'
-  e: DumpConfig1 = DumpConfig1()
-
-
-@dataclasses.dataclass
-class DumpConfig3(DumpConfig2):
-  f: int = 2
-  h: List[DumpConfig1] = dataclasses.field(
-      default_factory=lambda: [DumpConfig1(), DumpConfig1()])
-  g: Tuple[DumpConfig1, ...] = (DumpConfig1(),)
-
-
-@dataclasses.dataclass
-class DumpConfig4(DumpConfig2):
-  x: int = 3
-
-
-@dataclasses.dataclass
-class DummyConfig5(base_config.Config):
-  y: Tuple[DumpConfig2, ...] = (DumpConfig2(), DumpConfig4())
-  z: Tuple[str] = ('a',)
-
-
-class BaseConfigTest(parameterized.TestCase, tf.test.TestCase):
-
-  def assertHasSameTypes(self, c, d, msg=''):
-    """Checks if a Config has the same structure as a given dict.
-
-    Args:
-      c: the Config object to be checked.
-      d: the reference dict object.
-      msg: The error message to show when a type mismatch is found.
-    """
-    # Make sure d is not a Config. Assume d is either
-    # dictionary or primitive type and c is the Config or primitive types.
-    self.assertNotIsInstance(d, base_config.Config)
-    if isinstance(d, base_config.Config.IMMUTABLE_TYPES):
-      self.assertEqual(pprint.pformat(c), pprint.pformat(d), msg=msg)
-    elif isinstance(d, base_config.Config.SEQUENCE_TYPES):
-      self.assertEqual(type(c), type(d), msg=msg)
-      for i, v in enumerate(d):
-        self.assertHasSameTypes(c[i], v, msg='{}[{!r}]'.format(msg, i))
-    elif isinstance(d, dict):
-      self.assertIsInstance(c, base_config.Config, msg=msg)
-      for k, v in sorted(d.items()):
-        self.assertHasSameTypes(getattr(c, k), v, msg='{}[{!r}]'.format(msg, k))
-    else:
-      raise TypeError('Unknown type: %r' % type(d))
-
-  def assertImportExport(self, v):
-    config = base_config.Config({'key': v})
-    back = config.as_dict()['key']
-    self.assertEqual(pprint.pformat(back), pprint.pformat(v))
-    self.assertHasSameTypes(config.key, v, msg='=%s v' % pprint.pformat(v))
-
-  def test_invalid_keys(self):
-    params = base_config.Config()
-    with self.assertRaises(AttributeError):
-      _ = params.a
-
-  def test_nested_config_types(self):
-    config = DumpConfig3()
-    self.assertIsInstance(config.e, DumpConfig1)
-    self.assertIsInstance(config.h[0], DumpConfig1)
-    self.assertIsInstance(config.h[1], DumpConfig1)
-    self.assertIsInstance(config.g[0], DumpConfig1)
-
-    config.override({'e': {'a': 2, 'b': 'new text'}})
-    self.assertIsInstance(config.e, DumpConfig1)
-    self.assertEqual(config.e.a, 2)
-    self.assertEqual(config.e.b, 'new text')
-
-    config.override({'h': [{'a': 3, 'b': 'new text 2'}]})
-    self.assertIsInstance(config.h[0], DumpConfig1)
-    self.assertLen(config.h, 1)
-    self.assertEqual(config.h[0].a, 3)
-    self.assertEqual(config.h[0].b, 'new text 2')
-
-    config.override({'g': [{'a': 4, 'b': 'new text 3'}]})
-    self.assertIsInstance(config.g[0], DumpConfig1)
-    self.assertLen(config.g, 1)
-    self.assertEqual(config.g[0].a, 4)
-    self.assertEqual(config.g[0].b, 'new text 3')
-
-  def test_replace(self):
-    config = DumpConfig2()
-    new_config = config.replace(e={'a': 2})
-    self.assertEqual(new_config.e.a, 2)
-    self.assertIsInstance(new_config.e, DumpConfig1)
-
-    config = DumpConfig2(e=DumpConfig2())
-    new_config = config.replace(e={'c': 4})
-    self.assertEqual(new_config.e.c, 4)
-    self.assertIsInstance(new_config.e, DumpConfig2)
-
-    config = DumpConfig3()
-    new_config = config.replace(g=[{'a': 4, 'b': 'new text 3'}])
-    self.assertIsInstance(new_config.g[0], DumpConfig1)
-    self.assertEqual(new_config.g[0].a, 4)
-
-  @parameterized.parameters(
-      ('_locked', "The key '_locked' is internally reserved."),
-      ('_restrictions', "The key '_restrictions' is internally reserved."),
-      ('aa', "The key 'aa' does not exist."),
-  )
-  def test_key_error(self, key, msg):
-    params = base_config.Config()
-    with self.assertRaisesRegex(KeyError, msg):
-      params.override({key: True})
-
-  @parameterized.parameters(
-      ('str data',),
-      (123,),
-      (1.23,),
-      (None,),
-      (['str',
1, 2.3, None],), - (('str', 1, 2.3, None),), - ) - def test_import_export_immutable_types(self, v): - self.assertImportExport(v) - out = base_config.Config({'key': v}) - self.assertEqual(pprint.pformat(v), pprint.pformat(out.key)) - - def test_override_is_strict_true(self): - params = base_config.Config({ - 'a': 'aa', - 'b': 2, - 'c': { - 'c1': 'cc', - 'c2': 20 - } - }) - params.override({'a': 2, 'c': {'c1': 'ccc'}}, is_strict=True) - self.assertEqual(params.a, 2) - self.assertEqual(params.c.c1, 'ccc') - with self.assertRaises(KeyError): - params.override({'d': 'ddd'}, is_strict=True) - with self.assertRaises(KeyError): - params.override({'c': {'c3': 30}}, is_strict=True) - - config = base_config.Config({'key': [{'a': 42}]}) - with self.assertRaisesRegex(KeyError, "The key 'b' does not exist"): - config.override({'key': [{'b': 43}]}) - - @parameterized.parameters( - (lambda x: x, 'Unknown type'), - (object(), 'Unknown type'), - (set(), 'Unknown type'), - (frozenset(), 'Unknown type'), - ) - def test_import_unsupport_types(self, v, msg): - with self.assertRaisesRegex(TypeError, msg): - _ = base_config.Config({'key': v}) - - @parameterized.parameters( - ({ - 'a': [{ - 'b': 2, - }, { - 'c': 3, - }] - },), - ({ - 'c': [{ - 'f': 1.1, - }, { - 'h': [1, 2], - }] - },), - (({ - 'a': 'aa', - 'b': 2, - 'c': { - 'c1': 10, - 'c2': 20, - } - },),), - ) - def test_import_export_nested_structure(self, d): - self.assertImportExport(d) - - @parameterized.parameters( - ([{ - 'a': 42, - 'b': 'hello', - 'c': 1.2 - }],), - (({ - 'a': 42, - 'b': 'hello', - 'c': 1.2 - },),), - ) - def test_import_export_nested_sequences(self, v): - self.assertImportExport(v) - - @parameterized.parameters( - ([([{}],)],), - ([['str', 1, 2.3, None]],), - ((('str', 1, 2.3, None),),), - ([ - ('str', 1, 2.3, None), - ],), - ([ - ('str', 1, 2.3, None), - ],), - ([[{ - 'a': 42, - 'b': 'hello', - 'c': 1.2 - }]],), - ([[[{ - 'a': 42, - 'b': 'hello', - 'c': 1.2 - }]]],), - ((({ - 'a': 42, - 'b': 'hello', - 'c': 1.2 - },),),), - (((({ - 'a': 42, - 'b': 'hello', - 'c': 1.2 - },),),),), - ([({ - 'a': 42, - 'b': 'hello', - 'c': 1.2 - },)],), - (([{ - 'a': 42, - 'b': 'hello', - 'c': 1.2 - }],),), - ) - def test_import_export_unsupport_sequence(self, v): - with self.assertRaisesRegex(TypeError, - 'Invalid sequence: only supports single level'): - _ = base_config.Config({'key': v}) - - def test_construct_subtype(self): - pass - - def test_import_config(self): - params = base_config.Config({'a': [{'b': 2}, {'c': {'d': 3}}]}) - self.assertLen(params.a, 2) - self.assertEqual(params.a[0].b, 2) - self.assertEqual(type(params.a[0]), base_config.Config) - self.assertEqual(pprint.pformat(params.a[0].b), '2') - self.assertEqual(type(params.a[1]), base_config.Config) - self.assertEqual(type(params.a[1].c), base_config.Config) - self.assertEqual(pprint.pformat(params.a[1].c.d), '3') - - def test_override(self): - params = base_config.Config({'a': [{'b': 2}, {'c': {'d': 3}}]}) - params.override({'a': [{'b': 4}, {'c': {'d': 5}}]}, is_strict=False) - self.assertEqual(type(params.a), list) - self.assertEqual(type(params.a[0]), base_config.Config) - self.assertEqual(pprint.pformat(params.a[0].b), '4') - self.assertEqual(type(params.a[1]), base_config.Config) - self.assertEqual(type(params.a[1].c), base_config.Config) - self.assertEqual(pprint.pformat(params.a[1].c.d), '5') - - @parameterized.parameters( - ([{}],), - (({},),), - ) - def test_config_vs_params_dict(self, v): - d = {'key': v} - self.assertEqual(type(base_config.Config(d).key[0]), 
base_config.Config)
-    self.assertEqual(type(base_config.params_dict.ParamsDict(d).key[0]), dict)
-
-  def test_ppformat(self):
-    self.assertEqual(
-        pprint.pformat([
-            's', 1, 1.0, True, None, {}, [], (), {
-                (2,): (3, [4], {
-                    6: 7,
-                }),
-                8: 9,
-            }
-        ]),
-        "['s', 1, 1.0, True, None, {}, [], (), {8: 9, (2,): (3, [4], {6: 7})}]")
-
-  def test_with_restrictions(self):
-    restrictions = ['e.a<c']
-    config = DumpConfig2(restrictions=restrictions)
-    config.validate()
-
-
-if __name__ == '__main__':
-  tf.test.main()
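Before the `params_dict` module below, a small sketch of how its restriction mechanism is meant to be used; the key names here are illustrative, not taken from the deleted files:

```python
from official.modeling.hyperparams import params_dict

# Restrictions are binary relations between dotted keys; validate()
# re-checks them against the current parameter values.
params = params_dict.ParamsDict(
    {'train': {'batch_size': 32}, 'eval': {'batch_size': 32}},
    restrictions=['train.batch_size == eval.batch_size'])
params.override({'train': {'batch_size': 64}, 'eval': {'batch_size': 64}})
params.validate()  # passes: the restriction still holds
params.lock()      # further overrides or assignments now raise ValueError
```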
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/hyperparams/params_dict.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/hyperparams/params_dict.py
deleted file mode 100644
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/hyperparams/params_dict.py
+++ /dev/null
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""A parameter dictionary class which supports the nested structure."""
-
-import collections
-import copy
-import re
-
-import six
-import tensorflow as tf
-import yaml
-
-# Regex pattern that matches on key-value pairs in a comma-separated string.
-_PARAM_RE = re.compile(r"""
-  (?P<name>[a-zA-Z][\w\.]*)    # variable name: "var" or "x"
-  \s*=\s*
-  ((?P<val>\'(.*?)\'           # single quote
-  |
-  \"(.*?)\"                    # double quote
-  |
-  [^,\[]*                      # single value
-  |
-  \[[^\]]*\]))                 # list of values
-  ($|,\s*)""", re.VERBOSE)
-
-_CONST_VALUE_RE = re.compile(r'(\d.*|-\d.*|None)')
-
-# Yaml loader with an implicit resolver to parse float decimal and
-# exponential formats. The regular expression parses the following cases:
-# 1- Decimal number with an optional exponential term.
-# 2- Integer number with an exponential term.
-# 3- Decimal number with a leading dot and an optional exponential term.
-# 4- Sexagesimal number with a decimal part.
-
-LOADER = yaml.SafeLoader
-LOADER.add_implicit_resolver(
-    'tag:yaml.org,2002:float',
-    re.compile(r'''
-    ^(?:[-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+]?[0-9]+)?
-    |
-    [-+]?(?:[0-9][0-9_]*)(?:[eE][-+]?[0-9]+)
-    |
-    \\.[0-9_]+(?:[eE][-+][0-9]+)?
-    |
-    [-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*)$''', re.X),
-    list('-+0123456789.'))
-
-
-class ParamsDict(object):
-  """A hyperparameter container class."""
-
-  RESERVED_ATTR = ['_locked', '_restrictions']
-
-  def __init__(self, default_params=None, restrictions=None):
-    """Instantiate a ParamsDict.
-
-    Instantiate a ParamsDict given a set of default parameters and a list of
-    restrictions. The restrictions are checked against the current parameters
-    whenever `validate()` is called, and an error is raised if an
-    inconsistency is found.
-
-    Args:
-      default_params: a Python dict or another ParamsDict object including the
-        default parameters to initialize.
-      restrictions: a list of strings, which define a list of restrictions to
-        ensure the consistency of different parameters internally. Each
-        restriction string is defined as a binary relation with a set of
-        operators, including {'==', '!=', '<', '<=', '>', '>='}.
-    """
-    self._locked = False
-    self._restrictions = []
-    if restrictions:
-      self._restrictions = restrictions
-    if default_params is None:
-      default_params = {}
-    self.override(default_params, is_strict=False)
-
-  def _set(self, k, v):
-    if isinstance(v, dict):
-      self.__dict__[k] = ParamsDict(v)
-    else:
-      self.__dict__[k] = copy.deepcopy(v)
-
-  def __setattr__(self, k, v):
-    """Sets the value of the existing key.
-
-    Note that this does not allow directly defining a new key. Use the
-    `override` method with `is_strict=False` instead.
-
-    Args:
-      k: the key string.
-      v: the value to be used to set the key `k`.
-
-    Raises:
-      KeyError: if k is not defined in the ParamsDict.
-    """
-    if k not in ParamsDict.RESERVED_ATTR:
-      if k not in self.__dict__.keys():
-        raise KeyError('The key `{}` does not exist. '
-                       'To extend the existing keys, use '
-                       '`override` with `is_strict` = False.'.format(k))
-      if self._locked:
-        raise ValueError('The ParamsDict has been locked. '
-                         'No change is allowed.')
-    self._set(k, v)
-
-  def __getattr__(self, k):
-    """Gets the value of the existing key.
-
-    Args:
-      k: the key string.
-
-    Returns:
-      the value of the key.
-
-    Raises:
-      AttributeError: if k is not defined in the ParamsDict.
-    """
-    if k not in self.__dict__.keys():
-      raise AttributeError('The key `{}` does not exist. '.format(k))
-    return self.__dict__[k]
-
-  def __contains__(self, key):
-    """Implements the membership test operator."""
-    return key in self.__dict__
-
-  def get(self, key, value=None):
-    """Accesses through built-in dictionary get method."""
-    return self.__dict__.get(key, value)
-
-  def __delattr__(self, k):
-    """Deletes the key and removes its values.
-
-    Args:
-      k: the key string.
-
-    Raises:
-      AttributeError: if k is reserved or not defined in the ParamsDict.
-      ValueError: if the ParamsDict instance has been locked.
-    """
-    if k in ParamsDict.RESERVED_ATTR:
-      raise AttributeError(
-          'The key `{}` is reserved. No change is allowed. '.format(k))
-    if k not in self.__dict__.keys():
-      raise AttributeError('The key `{}` does not exist. '.format(k))
-    if self._locked:
-      raise ValueError('The ParamsDict has been locked. No change is allowed.')
-    del self.__dict__[k]
-
-  def override(self, override_params, is_strict=True):
-    """Override the ParamsDict with a set of given params.
-
-    Args:
-      override_params: a dict or a ParamsDict specifying the parameters to be
-        overridden.
-      is_strict: a boolean specifying whether override is strict or not. If
-        True, keys in `override_params` must be present in the ParamsDict. If
-        False, keys in `override_params` can be different from what is
-        currently defined in the ParamsDict. In this case, the ParamsDict will
-        be extended to include the new keys.
-    """
-    if self._locked:
-      raise ValueError('The ParamsDict has been locked. No change is allowed.')
-    if isinstance(override_params, ParamsDict):
-      override_params = override_params.as_dict()
-    self._override(override_params, is_strict)  # pylint: disable=protected-access
-
-  def _override(self, override_dict, is_strict=True):
-    """The implementation of `override`."""
-    for k, v in six.iteritems(override_dict):
-      if k in ParamsDict.RESERVED_ATTR:
-        raise KeyError('The key `{}` is internally reserved. '
-                       'Can not be overridden.'.format(k))
-      if k not in self.__dict__.keys():
-        if is_strict:
-          raise KeyError('The key `{}` does not exist. '
-                         'To extend the existing keys, use '
-                         '`override` with `is_strict` = False.'.format(k))
-        else:
-          self._set(k, v)
-      else:
-        if isinstance(v, dict):
-          self.__dict__[k]._override(v, is_strict)  # pylint: disable=protected-access
-        elif isinstance(v, ParamsDict):
-          self.__dict__[k]._override(v.as_dict(), is_strict)  # pylint: disable=protected-access
-        else:
-          self.__dict__[k] = copy.deepcopy(v)
-
-  def lock(self):
-    """Makes the ParamsDict immutable."""
-    self._locked = True
-
-  def as_dict(self):
-    """Returns a dict representation of ParamsDict.
-
-    For the nested ParamsDict, a nested dict will be returned.
-    """
-    params_dict = {}
-    for k, v in six.iteritems(self.__dict__):
-      if k not in ParamsDict.RESERVED_ATTR:
-        if isinstance(v, ParamsDict):
-          params_dict[k] = v.as_dict()
-        else:
-          params_dict[k] = copy.deepcopy(v)
-    return params_dict
-
-  def validate(self):
-    """Validates the parameter consistency based on the restrictions.
-
-    This method validates the internal consistency using the pre-defined list
-    of restrictions. A restriction is defined as a string which specifies a
-    binary operation. The supported binary operations are {'==', '!=', '<',
-    '<=', '>', '>='}. Note that the meanings of these operators are consistent
-    with the underlying Python implementation. Users should make sure the
-    defined restrictions on their types make sense.
-
-    For example, for a ParamsDict like the following
-    ```
-    a:
-      a1: 1
-      a2: 2
-    b:
-      bb:
-        bb1: 10
-        bb2: 20
-      ccc:
-        a1: 1
-        a3: 3
-    ```
-    one can define two restrictions like this
-    ['a.a1 == b.ccc.a1', 'a.a2 <= b.bb.bb2']
-
-    What it enforces are:
-    - a.a1 = 1 == b.ccc.a1 = 1
-    - a.a2 = 2 <= b.bb.bb2 = 20
-
-    Raises:
-      KeyError: if any of the following happens
-        (1) any of parameters in any of restrictions is not defined in
-            ParamsDict,
-        (2) any inconsistency violating the restriction is found.
-      ValueError: if the restriction defined in the string is not supported.
-    """
-
-    def _get_kv(dotted_string, params_dict):
-      """Get keys and values indicated by dotted_string."""
-      if _CONST_VALUE_RE.match(dotted_string) is not None:
-        const_str = dotted_string
-        if const_str == 'None':
-          constant = None
-        else:
-          constant = float(const_str)
-        return None, constant
-      else:
-        tokenized_params = dotted_string.split('.')
-        v = params_dict
-        for t in tokenized_params:
-          v = v[t]
-        return tokenized_params[-1], v
-
-    def _get_kvs(tokens, params_dict):
-      if len(tokens) != 2:
-        raise ValueError('Only binary relations are supported in '
-                         'restrictions.')
-      stripped_tokens = [t.strip() for t in tokens]
-      left_k, left_v = _get_kv(stripped_tokens[0], params_dict)
-      right_k, right_v = _get_kv(stripped_tokens[1], params_dict)
-      return left_k, left_v, right_k, right_v
-
-    params_dict = self.as_dict()
-    for restriction in self._restrictions:
-      if '==' in restriction:
-        tokens = restriction.split('==')
-        _, left_v, _, right_v = _get_kvs(tokens, params_dict)
-        if left_v != right_v:
-          raise KeyError(
-              'Found inconsistency between key `{}` and key `{}`.'.format(
-                  tokens[0], tokens[1]))
-      elif '!=' in restriction:
-        tokens = restriction.split('!=')
-        _, left_v, _, right_v = _get_kvs(tokens, params_dict)
-        if left_v == right_v:
-          raise KeyError(
-              'Found inconsistency between key `{}` and key `{}`.'.format(
-                  tokens[0], tokens[1]))
-      # Check '<=' and '>=' before '<' and '>' so that a restriction such as
-      # 'a <= b' is not mistakenly split on the '<' operator.
-      elif '<=' in restriction:
-        tokens = restriction.split('<=')
-        _, left_v, _, right_v = _get_kvs(tokens, params_dict)
-        if left_v > right_v:
-          raise KeyError(
-              'Found inconsistency between key `{}` and key `{}`.'.format(
-                  tokens[0], tokens[1]))
-      elif '<' in restriction:
-        tokens = restriction.split('<')
-        _, left_v, _, right_v = _get_kvs(tokens, params_dict)
-        if left_v >= right_v:
-          raise KeyError(
-              'Found inconsistency between key `{}` and key `{}`.'.format(
-                  tokens[0], tokens[1]))
-      elif '>=' in restriction:
-        tokens = restriction.split('>=')
-        _, left_v, _, right_v = _get_kvs(tokens, params_dict)
-        if left_v < right_v:
-          raise KeyError(
-              'Found inconsistency between key `{}` and key `{}`.'.format(
-                  tokens[0], tokens[1]))
-      elif '>' in restriction:
-        tokens = restriction.split('>')
-        _, left_v, _, right_v = _get_kvs(tokens, params_dict)
-        if left_v <= right_v:
-          raise KeyError(
-              'Found inconsistency between key `{}` and key `{}`.'.format(
-                  tokens[0], tokens[1]))
-      else:
-        raise ValueError('Unsupported relation in restriction.')
-
-
-def read_yaml_to_params_dict(file_path: str):
-  """Reads a YAML file to a ParamsDict."""
-  with tf.io.gfile.GFile(file_path, 'r') as f:
-    params_dict = yaml.load(f, Loader=LOADER)
-    return ParamsDict(params_dict)
-
-
-def save_params_dict_to_yaml(params, file_path):
-  """Saves the input ParamsDict to a YAML file."""
-  with tf.io.gfile.GFile(file_path, 'w') as f:
-
-    def _my_list_rep(dumper, data):
-      # u'tag:yaml.org,2002:seq' is the YAML internal tag for sequence.
-      return dumper.represent_sequence(
-          u'tag:yaml.org,2002:seq', data, flow_style=True)
-
-    yaml.add_representer(list, _my_list_rep)
-    yaml.dump(params.as_dict(), f, default_flow_style=False)
-
-
-def nested_csv_str_to_json_str(csv_str):
-  """Converts a nested (using '.') comma-separated k=v string to a JSON string.
-
-  Converts a comma-separated string of key/value pairs that supports
-  nesting of keys to a JSON string. Nesting is implemented using
-  '.' between levels for a given key.
-
-  Spacing between commas and = is supported (e.g. there is no difference
-  between "a=1,b=2", "a = 1, b = 2", or "a=1, b=2") but there should be no
-  spaces before keys or after values (e.g. " a=1,b=2" and "a=1,b=2 " are not
-  supported).
-
-  Note that this will only support values supported by CSV, meaning
-  values such as nested lists (e.g. "a=[[1,2,3],[4,5,6]]") are not
-  supported. Strings are supported as well, e.g. "a='hello'".
-
-  An example conversion would be:
-
-  "a=1, b=2, c.a=2, c.b=3, d.a.a=5"
-
-  to
-
-  "{ a: 1, b : 2, c: {a : 2, b : 3}, d: {a: {a : 5}}}"
-
-  Args:
-    csv_str: the comma separated string.
-
-  Returns:
-    the converted JSON string.
-
-  Raises:
-    ValueError: if csv_str is not a comma-separated string of key/value
-      pairs, or if the string is formatted incorrectly.
-  """
-  if not csv_str:
-    return ''
-
-  formatted_entries = []
-  nested_map = collections.defaultdict(list)
-  pos = 0
-  while pos < len(csv_str):
-    m = _PARAM_RE.match(csv_str, pos)
-    if not m:
-      raise ValueError('Malformed hyperparameter value while parsing '
-                       'CSV string: %s' % csv_str[pos:])
-    pos = m.end()
-    # Parse the values.
-    m_dict = m.groupdict()
-    name = m_dict['name']
-    v = m_dict['val']
-
-    # If a GCS path (e.g. gs://...) is provided, wrap it in quotes, as
-    # yaml.load would otherwise throw an exception.
-    if re.match(r'gs://', v):
-      v = '\'{}\''.format(v)
-
-    name_nested = name.split('.')
-    if len(name_nested) > 1:
-      grouping = name_nested[0]
-      value = '.'.join(name_nested[1:]) + '=' + v
-      nested_map[grouping].append(value)
-    else:
-      formatted_entries.append('%s : %s' % (name, v))
-
-  for grouping, value in nested_map.items():
-    value = ','.join(value)
-    value = nested_csv_str_to_json_str(value)
-    formatted_entries.append('%s : %s' % (grouping, value))
-  return '{' + ', '.join(formatted_entries) + '}'
-
-
-def override_params_dict(params, dict_or_string_or_yaml_file, is_strict):
-  """Override a given ParamsDict using a dict, JSON/YAML/CSV string or YAML file.
-
-  The logic of the function is outlined below:
-  1. Tests that the input is a dict. If not, proceeds to 2.
-  2. Tests that the input is a string. If not, raises a ValueError for the
-     unknown input type.
-  2.1. Tests if the string is in a CSV format. If so, parses it.
-       If not, proceeds to 2.2.
-  2.2. Tries loading the string as YAML/JSON. If successful, parses it to
-       a dict and uses it to override. If not, proceeds to 2.3.
-  2.3. Tries using the string as a file path and loads the YAML file.
-
-  Args:
-    params: a ParamsDict object to be overridden.
-    dict_or_string_or_yaml_file: a Python dict, JSON/YAML/CSV string or path
-      to a YAML file specifying the parameters to be overridden.
-    is_strict: a boolean specifying whether override is strict or not.
-
-  Returns:
-    params: the overridden ParamsDict object.
-
-  Raises:
-    ValueError: if the parameters cannot be overridden.
- """ - if not dict_or_string_or_yaml_file: - return params - if isinstance(dict_or_string_or_yaml_file, dict): - params.override(dict_or_string_or_yaml_file, is_strict) - elif isinstance(dict_or_string_or_yaml_file, six.string_types): - try: - dict_or_string_or_yaml_file = ( - nested_csv_str_to_json_str(dict_or_string_or_yaml_file)) - except ValueError: - pass - params_dict = yaml.load(dict_or_string_or_yaml_file, Loader=LOADER) - if isinstance(params_dict, dict): - params.override(params_dict, is_strict) - else: - with tf.io.gfile.GFile(dict_or_string_or_yaml_file) as f: - params.override(yaml.load(f, Loader=yaml.FullLoader), is_strict) - else: - raise ValueError('Unknown input type to parse.') - return params diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/hyperparams/params_dict_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/hyperparams/params_dict_test.py deleted file mode 100644 index 528f8df039c4f552e0c68ccf1527cc5f9abfaf6d..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/hyperparams/params_dict_test.py +++ /dev/null @@ -1,445 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""Tests for params_dict.py.""" - -import os - -import tensorflow as tf -import yaml - -from official.modeling.hyperparams import params_dict - - -class ParamsDictTest(tf.test.TestCase): - - def test_init_from_an_empty_dict(self): - params = params_dict.ParamsDict() - with self.assertRaises(AttributeError): - _ = params.a - - with self.assertRaises(KeyError): - params.a = 'aa' - - def test_init_from_a_dict(self): - params = params_dict.ParamsDict({'a': 'aa', 'b': 2}) - self.assertEqual(params.a, 'aa') - self.assertEqual(params.b, 2) - - def test_init_from_a_param_dict(self): - params_init = params_dict.ParamsDict({'a': 'aa', 'b': 2}) - params = params_dict.ParamsDict(params_init) - self.assertEqual(params.a, 'aa') - self.assertEqual(params.b, 2) - - def test_lock(self): - params = params_dict.ParamsDict({'a': 1, 'b': 2, 'c': 3}) - params.lock() - with self.assertRaises(ValueError): - params.a = 10 - with self.assertRaises(ValueError): - params.override({'b': 20}) - with self.assertRaises(ValueError): - del params.c - - def test_setattr(self): - params = params_dict.ParamsDict() - params.override({'a': 'aa', 'b': 2, 'c': None}, is_strict=False) - params.c = 'ccc' - self.assertEqual(params.a, 'aa') - self.assertEqual(params.b, 2) - self.assertEqual(params.c, 'ccc') - - def test_getattr(self): - params = params_dict.ParamsDict() - params.override({'a': 'aa', 'b': 2, 'c': None}, is_strict=False) - self.assertEqual(params.a, 'aa') - self.assertEqual(params.b, 2) - self.assertEqual(params.c, None) - - def test_delattr(self): - params = params_dict.ParamsDict() - params.override({ - 'a': 'aa', - 'b': 2, - 'c': None, - 'd': { - 'd1': 1, - 'd2': 10 - } - }, - is_strict=False) - del params.c - self.assertEqual(params.a, 'aa') - self.assertEqual(params.b, 2) - with self.assertRaises(AttributeError): - _ = params.c - del params.d - with self.assertRaises(AttributeError): - _ = params.d.d1 - - def test_contains(self): - params = params_dict.ParamsDict() - params.override({'a': 'aa'}, is_strict=False) - self.assertIn('a', params) - self.assertNotIn('b', params) - - def test_get(self): - params = params_dict.ParamsDict() - params.override({'a': 'aa'}, is_strict=False) - self.assertEqual(params.get('a'), 'aa') - self.assertEqual(params.get('b', 2), 2) - self.assertEqual(params.get('b'), None) - - def test_override_is_strict_true(self): - params = params_dict.ParamsDict({ - 'a': 'aa', - 'b': 2, - 'c': { - 'c1': 'cc', - 'c2': 20 - } - }) - params.override({'a': 2, 'c': {'c1': 'ccc'}}, is_strict=True) - self.assertEqual(params.a, 2) - self.assertEqual(params.c.c1, 'ccc') - with self.assertRaises(KeyError): - params.override({'d': 'ddd'}, is_strict=True) - with self.assertRaises(KeyError): - params.override({'c': {'c3': 30}}, is_strict=True) - - def test_override_is_strict_false(self): - params = params_dict.ParamsDict({ - 'a': 'aa', - 'b': 2, - 'c': { - 'c1': 10, - 'c2': 20 - } - }) - params.override({'a': 2, 'c': {'c3': 3000}}, is_strict=False) - self.assertEqual(params.a, 2) - self.assertEqual(params.c.c3, 3000) - params.override({'d': 'ddd'}, is_strict=False) - self.assertEqual(params.d, 'ddd') - params.override({'c': {'c4': 4444}}, is_strict=False) - self.assertEqual(params.c.c4, 4444) - - def test_as_dict(self): - params = params_dict.ParamsDict({ - 'a': 'aa', - 'b': 2, - 'c': { - 'c1': 10, - 'c2': 20 - } - }) - params_d = params.as_dict() - self.assertEqual(params_d['a'], 'aa') - self.assertEqual(params_d['b'], 2) - self.assertEqual(params_d['c']['c1'], 10) - self.assertEqual(params_d['c']['c2'], 
20)
-
-  def test_validate(self):
-    # Raise error due to the unknown parameter.
-    with self.assertRaises(KeyError):
-      params = params_dict.ParamsDict({'a': 1, 'b': {'a': 11}}, ['a == c'])
-      params.validate()
-
-    # OK to check equality of two nested dicts.
-    params = params_dict.ParamsDict({
-        'a': 1,
-        'b': {
-            'a': 10
-        },
-        'c': {
-            'a': 10
-        }
-    }, ['b == c'])
-
-    # Raise error due to inconsistency.
-    with self.assertRaises(KeyError):
-      params = params_dict.ParamsDict({'a': 1, 'c': {'a': 10}}, ['a == c.a'])
-      params.validate()
-
-    # Valid rule.
-    params = params_dict.ParamsDict({'a': 1, 'c': {'a': 1}}, ['a == c.a'])
-
-    # Overriding violates the existing rule, raise error upon validate.
-    params.override({'a': 11})
-    with self.assertRaises(KeyError):
-      params.validate()
-
-    # Valid restrictions with constant.
-    params = params_dict.ParamsDict({
-        'a': None,
-        'c': {
-            'a': 1
-        }
-    }, ['a == None', 'c.a == 1'])
-    params.validate()
-    with self.assertRaises(KeyError):
-      params = params_dict.ParamsDict({
-          'a': 4,
-          'c': {
-              'a': 1
-          }
-      }, ['a == None', 'c.a == 1'])
-      params.validate()
-
-
-class ParamsDictIOTest(tf.test.TestCase):
-
-  def write_temp_file(self, filename, text):
-    temp_file = os.path.join(self.get_temp_dir(), filename)
-    with tf.io.gfile.GFile(temp_file, 'w') as writer:
-      writer.write(text)
-    return temp_file
-
-  def test_save_params_dict_to_yaml(self):
-    params = params_dict.ParamsDict({
-        'a': 'aa',
-        'b': 2,
-        'c': {
-            'c1': 10,
-            'c2': 20
-        }
-    })
-    output_yaml_file = os.path.join(self.get_temp_dir(), 'params.yaml')
-    params_dict.save_params_dict_to_yaml(params, output_yaml_file)
-
-    with tf.io.gfile.GFile(output_yaml_file, 'r') as f:
-      params_d = yaml.load(f, Loader=yaml.FullLoader)
-      self.assertEqual(params.a, params_d['a'])
-      self.assertEqual(params.b, params_d['b'])
-      self.assertEqual(params.c.c1, params_d['c']['c1'])
-      self.assertEqual(params.c.c2, params_d['c']['c2'])
-
-  def test_read_yaml_to_params_dict(self):
-    input_yaml_file = self.write_temp_file(
-        'params.yaml', r"""
-      a: 'aa'
-      b: 2
-      c:
-        c1: 10
-        c2: 20
-    """)
-    params = params_dict.read_yaml_to_params_dict(input_yaml_file)
-
-    self.assertEqual(params.a, 'aa')
-    self.assertEqual(params.b, 2)
-    self.assertEqual(params.c.c1, 10)
-    self.assertEqual(params.c.c2, 20)
-
-  def test_override_params_dict_using_dict(self):
-    params = params_dict.ParamsDict({
-        'a': 1,
-        'b': 2.5,
-        'c': [3, 4],
-        'd': 'hello',
-        'e': False
-    })
-    override_dict = {'b': 5.2, 'c': [30, 40]}
-    params = params_dict.override_params_dict(
-        params, override_dict, is_strict=True)
-    self.assertEqual(1, params.a)
-    self.assertEqual(5.2, params.b)
-    self.assertEqual([30, 40], params.c)
-    self.assertEqual('hello', params.d)
-    self.assertEqual(False, params.e)
-
-  def test_override_params_dict_using_yaml_string(self):
-    params = params_dict.ParamsDict({
-        'a': 1,
-        'b': 2.5,
-        'c': [3, 4],
-        'd': 'hello',
-        'e': False
-    })
-    override_yaml_string = "'b': 5.2\n'c': [30, 40]"
-    params = params_dict.override_params_dict(
-        params, override_yaml_string, is_strict=True)
-    self.assertEqual(1, params.a)
-    self.assertEqual(5.2, params.b)
-    self.assertEqual([30, 40], params.c)
-    self.assertEqual('hello', params.d)
-    self.assertEqual(False, params.e)
-
-  def test_override_params_dict_using_json_string(self):
-    params = params_dict.ParamsDict({
-        'a': 1,
-        'b': {
-            'b1': 2,
-            'b2': [2, 3],
-        },
-        'd': {
-            'd1': {
-                'd2': 'hello'
-            }
-        },
-        'e': False
-    })
-    override_json_string = "{ b: { b2: [3, 4] }, d: { d1: { d2: 'hi' } } }"
-    params = params_dict.override_params_dict(
-        params, override_json_string, is_strict=True)
-    self.assertEqual(1, params.a)
-    self.assertEqual(2, params.b.b1)
-    self.assertEqual([3, 4], params.b.b2)
-    self.assertEqual('hi', params.d.d1.d2)
-    self.assertEqual(False, params.e)
-
-  def test_override_params_dict_using_csv_string(self):
-    params = params_dict.ParamsDict({
-        'a': 1,
-        'b': {
-            'b1': 2,
-            'b2': [2, 3],
-        },
-        'd': {
-            'd1': {
-                'd2': 'hello'
-            }
-        },
-        'e': False
-    })
-    override_csv_string = "b.b2=[3,4], d.d1.d2='hi, world', e=gs://test"
-    params = params_dict.override_params_dict(
-        params, override_csv_string, is_strict=True)
-    self.assertEqual(1, params.a)
-    self.assertEqual(2, params.b.b1)
-    self.assertEqual([3, 4], params.b.b2)
-    self.assertEqual('hi, world', params.d.d1.d2)
-    self.assertEqual('gs://test', params.e)
-    # Test different float formats.
-    override_csv_string = 'b.b2=-1.e-3, d.d1.d2=+0.001, e=1e+3, a=-1.5E-3'
-    params = params_dict.override_params_dict(
-        params, override_csv_string, is_strict=True)
-    self.assertEqual(-1e-3, params.b.b2)
-    self.assertEqual(0.001, params.d.d1.d2)
-    self.assertEqual(1e3, params.e)
-    self.assertEqual(-1.5e-3, params.a)
-
-  def test_override_params_dict_using_yaml_file(self):
-    params = params_dict.ParamsDict({
-        'a': 1,
-        'b': 2.5,
-        'c': [3, 4],
-        'd': 'hello',
-        'e': False
-    })
-    override_yaml_file = self.write_temp_file(
-        'params.yaml', r"""
-      b: 5.2
-      c: [30, 40]
-    """)
-    params = params_dict.override_params_dict(
-        params, override_yaml_file, is_strict=True)
-    self.assertEqual(1, params.a)
-    self.assertEqual(5.2, params.b)
-    self.assertEqual([30, 40], params.c)
-    self.assertEqual('hello', params.d)
-    self.assertEqual(False, params.e)
-
-
-class IOTest(tf.test.TestCase):
-
-  def test_basic_csv_str_to_json_str(self):
-    csv_str = 'a=1,b=2,c=3'
-    json_str = '{a : 1, b : 2, c : 3}'
-    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
-    self.assertEqual(converted_csv_str, json_str)
-
-  def test_basic_csv_str_load(self):
-    csv_str = 'a=1,b=2,c=3'
-    expected_output = {'a': 1, 'b': 2, 'c': 3}
-    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
-    converted_dict = yaml.load(converted_csv_str, Loader=yaml.FullLoader)
-    self.assertDictEqual(converted_dict, expected_output)
-
-  def test_basic_nested_csv_str_to_json_str(self):
-    csv_str = 'a=1,b.b1=2'
-    json_str = '{a : 1, b : {b1 : 2}}'
-    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
-    self.assertEqual(converted_csv_str, json_str)
-
-  def test_basic_nested_csv_str_load(self):
-    csv_str = 'a=1,b.b1=2,c.c1=3'
-    expected_output = {'a': 1, 'b': {'b1': 2}, 'c': {'c1': 3}}
-    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
-    converted_dict = yaml.load(converted_csv_str, Loader=yaml.FullLoader)
-    self.assertDictEqual(converted_dict, expected_output)
-
-  def test_complex_nested_csv_str_to_json_str(self):
-    csv_str = 'a.aa.aaa.aaaaa.a=1'
-    json_str = '{a : {aa : {aaa : {aaaaa : {a : 1}}}}}'
-    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
-    self.assertEqual(converted_csv_str, json_str)
-
-  def test_complex_nested_csv_str_load(self):
-    csv_str = 'a.aa.aaa.aaaaa.a=1,a.a=2'
-    expected_output = {'a': {'aa': {'aaa': {'aaaaa': {'a': 1}}}, 'a': 2}}
-    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
-    converted_dict = yaml.load(converted_csv_str, Loader=yaml.FullLoader)
-    self.assertDictEqual(converted_dict, expected_output)
-
-  def test_csv_str_load_supported_datatypes(self):
-    csv_str = 'a=1,b=2.,c=[1,2,3],d=\'hello, there\',e=\"Hi.\"'
-    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
-    converted_dict = yaml.load(converted_csv_str, Loader=yaml.FullLoader)
-    self.assertEqual(converted_dict['a'], 1)
-    self.assertEqual(converted_dict['b'], 2.)
-    self.assertEqual(converted_dict['c'], [1, 2, 3])
-    self.assertEqual(converted_dict['d'], 'hello, there')
-    self.assertEqual(converted_dict['e'], 'Hi.')
-
-  def test_csv_str_load_unsupported_datatypes(self):
-    csv_str = 'a=[[1,2,3],[4,5,6]]'
-    self.assertRaises(ValueError, params_dict.nested_csv_str_to_json_str,
-                      csv_str)
-
-  def test_csv_str_to_json_str_spacing(self):
-    csv_str1 = 'a=1,b=2,c=3'
-    csv_str2 = 'a = 1, b = 2, c = 3'
-    json_str = '{a : 1, b : 2, c : 3}'
-    converted_csv_str1 = params_dict.nested_csv_str_to_json_str(csv_str1)
-    converted_csv_str2 = params_dict.nested_csv_str_to_json_str(csv_str2)
-    self.assertEqual(converted_csv_str1, converted_csv_str2)
-    self.assertEqual(converted_csv_str1, json_str)
-    self.assertEqual(converted_csv_str2, json_str)
-
-  def test_gcs_added_quotes(self):
-    csv_str = 'a=gs://abc, b=gs://def'
-    expected_output = '{a : \'gs://abc\', b : \'gs://def\'}'
-    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
-    self.assertEqual(converted_csv_str, expected_output)
-
-
-if __name__ == '__main__':
-  tf.test.main()
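As a recap of the override flow exercised by the tests above, here is a small sketch; the parameter names and values are illustrative:

```python
from official.modeling.hyperparams import params_dict

params = params_dict.ParamsDict({'a': 1, 'b': {'b1': 2}})
# Nested CSV strings are first rewritten into a JSON/YAML-style string ...
json_str = params_dict.nested_csv_str_to_json_str('a=2, b.b1=3')
print(json_str)  # {a : 2, b : {b1 : 3}}
# ... which override_params_dict then parses and applies.
params = params_dict.override_params_dict(params, 'a=2, b.b1=3',
                                          is_strict=True)
assert params.a == 2 and params.b.b1 == 3
```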
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/__init__.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/__init__.py
deleted file mode 100644
index a11b1ff79e891e0fcee5bf824718e75d9103e28f..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/__init__.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/base_model.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/base_model.py
deleted file mode 100644
index f81140e85ed1bef681378218440acb1d1a941a35..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/base_model.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Lint as: python3
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ==============================================================================
-"""Abstraction of multi-task model."""
-from typing import Text, Dict
-
-import tensorflow as tf
-
-
-class MultiTaskBaseModel(tf.Module):
-  """Base class that holds multi-task model computation."""
-
-  def __init__(self, **kwargs):
-    super().__init__(**kwargs)
-    self._sub_tasks = self._instantiate_sub_tasks()
-
-  def _instantiate_sub_tasks(self) -> Dict[Text, tf.keras.Model]:
-    """Abstract function that sets up the computation for each sub-task.
-
-    Returns:
-      A map from task name (as string) to a tf.keras.Model object that
-      represents the sub-task in the multi-task pool.
-    """
-    raise NotImplementedError(
-        '_instantiate_sub_tasks() is not implemented.')
-
-  @property
-  def sub_tasks(self):
-    """Fetch a map of task name (string) to task model (tf.keras.Model)."""
-    return self._sub_tasks
-
-  def initialize(self):
-    """Optional function that loads a pre-trained checkpoint."""
-    return
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/base_trainer.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/base_trainer.py
deleted file mode 100644
index 4bcc17fa15909bce286f7ab6761f0c67ab93d22b..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/base_trainer.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Lint as: python3 -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================== -"""Multitask base trainer implementation. - -The trainer derives from the Orbit `StandardTrainer` class. 
-""" -from typing import Union -import gin -import orbit -import tensorflow as tf - -from official.modeling.multitask import base_model -from official.modeling.multitask import multitask - - -@gin.configurable -class MultiTaskBaseTrainer(orbit.StandardTrainer): - """Multitask base trainer.""" - - def __init__(self, - multi_task: multitask.MultiTask, - multi_task_model: Union[tf.keras.Model, - base_model.MultiTaskBaseModel], - optimizer: tf.optimizers.Optimizer, - trainer_options=None): - self._strategy = tf.distribute.get_strategy() - self._multi_task = multi_task - self._multi_task_model = multi_task_model - self._optimizer = optimizer - - self._training_losses = None - self._training_metrics = None - self._global_step = orbit.utils.create_global_step() - - if hasattr(self.multi_task_model, "checkpoint_items"): - checkpoint_items = self.multi_task_model.checkpoint_items - else: - checkpoint_items = {} - - self._checkpoint = tf.train.Checkpoint( - model=self.multi_task_model, - optimizer=self.optimizer, - global_step=self.global_step, - **checkpoint_items) - - train_datasets = {} - for name, task in self.multi_task.tasks.items(): - train_datasets[name] = orbit.utils.make_distributed_dataset( - self.strategy, task.build_inputs, task.task_config.train_data) - - super().__init__( - train_dataset=train_datasets, - options=trainer_options or orbit.StandardTrainerOptions()) - - def train_loop_begin(self): - """Clean up states that hold losses and metrics.""" - for _, train_loss_metric in self.training_losses.items(): - train_loss_metric.reset_states() - - for _, metrics in self.training_metrics.items(): - for metric in metrics: - metric.reset_states() - - def train_loop_end(self): - """Record loss and metric values per task.""" - result = {} - for task_name, loss in self.training_losses.items(): - result[task_name] = {loss.name: loss.result()} - for task_name, task_metrics in self.training_metrics.items(): - result[task_name].update( - {metric.name: metric.result() for metric in task_metrics}) - # Note that, the learning rate schedule is managed by the keras optimizer - # internally, which respects the number of backward pass as `iterations`. - # The learning rate schedule does not follow the trainer logical global - # step of multiple tasks. - if callable(self.optimizer.learning_rate): - result["learning_rate"] = self.optimizer.learning_rate( - self.optimizer.iterations) - else: - result["learning_rate"] = self.optimizer.learning_rate - return result - - @property - def checkpoint(self): - """Accesses the training checkpoint.""" - return self._checkpoint - - @property - def training_losses(self): - """Access training loss metric objects for all tasks.""" - if self._training_losses is None: - # Builds the per-task metrics and losses. - # This the total summed training loss of tasks in the joint training. - self._training_losses = dict( - total_loss=tf.keras.metrics.Mean("training_loss", dtype=tf.float32)) - for name in self.multi_task.tasks: - self._training_losses[name] = tf.keras.metrics.Mean( - "training_loss", dtype=tf.float32) - return self._training_losses - - @property - def training_metrics(self): - """Access training metric metric objects for all tasks.""" - if self._training_metrics is None: - # Builds the per-task metrics and losses. 
- self._training_metrics = {} - for name, task in self.multi_task.tasks.items(): - self._training_metrics[name] = task.build_metrics(training=True) - return self._training_metrics - - @property - def strategy(self): - return self._strategy - - @property - def multi_task(self): - return self._multi_task - - @property - def multi_task_model(self): - return self._multi_task_model - - @property - def optimizer(self): - return self._optimizer - - @property - def global_step(self): - return self._global_step - - def train_step(self, iterator_map): - """The default train step calling the multi-task train step. - - Args: - iterator_map: a dictionary of task names and per-task dataset iterators. - """ - - def step_fn(inputs): - losses = self.multi_task.joint_train_step( - inputs, - multi_task_model=self.multi_task_model, - optimizer=self.optimizer, - task_metrics=self.training_metrics) - for key, loss in losses.items(): - self.training_losses[key].update_state(loss) - - self.strategy.run( - step_fn, args=(tf.nest.map_structure(next, iterator_map),)) - self.global_step.assign_add(1) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/base_trainer_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/base_trainer_test.py deleted file mode 100644 index 986845c025a8ee1a7334277ae7371fc9815dfcda..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/base_trainer_test.py +++ /dev/null @@ -1,106 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
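Condensing the pattern the test file below exercises, a hedged sketch of wiring the joint trainer together; the mock tasks and model come from the `test_utils` module later in this diff:

```python
import tensorflow as tf

from official.modeling.multitask import base_trainer
from official.modeling.multitask import multitask
from official.modeling.multitask import test_utils

tasks = [
    test_utils.MockFooTask(params=test_utils.FooConfig(), name="foo"),
    test_utils.MockBarTask(params=test_utils.BarConfig(), name="bar"),
]
trainer = base_trainer.MultiTaskBaseTrainer(
    multi_task=multitask.MultiTask(tasks=tasks),
    multi_task_model=test_utils.MockMultiTaskModel(),
    optimizer=tf.keras.optimizers.SGD(0.1))
# Five joint steps; the result maps each task name to its loss/metrics.
logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32))
```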
-# - -"""Tests for multitask.base_trainer.""" -from absl.testing import parameterized -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations -from official.modeling.multitask import base_trainer -from official.modeling.multitask import configs -from official.modeling.multitask import multitask -from official.modeling.multitask import test_utils - - -def all_strategy_combinations(): - return combinations.combine( - distribution=[ - strategy_combinations.default_strategy, - strategy_combinations.cloud_tpu_strategy, - strategy_combinations.one_device_strategy_gpu, - ], - mode="eager", - ) - - -class BaseTrainerTest(tf.test.TestCase, parameterized.TestCase): - - @combinations.generate(all_strategy_combinations()) - def test_multitask_joint_trainer(self, distribution): - with distribution.scope(): - tasks = [ - test_utils.MockFooTask(params=test_utils.FooConfig(), name="foo"), - test_utils.MockBarTask(params=test_utils.BarConfig(), name="bar") - ] - task_weights = {"foo": 1.0, "bar": 1.0} - test_multitask = multitask.MultiTask( - tasks=tasks, task_weights=task_weights) - test_optimizer = tf.keras.optimizers.SGD(0.1) - model = test_utils.MockMultiTaskModel() - test_trainer = base_trainer.MultiTaskBaseTrainer( - multi_task=test_multitask, - multi_task_model=model, - optimizer=test_optimizer) - results = test_trainer.train(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertContainsSubset(["training_loss", "bar_acc"], - results["bar"].keys()) - self.assertContainsSubset(["training_loss", "foo_acc"], - results["foo"].keys()) - - def test_trainer_with_configs(self): - config = configs.MultiTaskConfig( - task_routines=(configs.TaskRoutine( - task_name="foo", - task_config=test_utils.FooConfig(), - task_weight=0.5), - configs.TaskRoutine( - task_name="bar", - task_config=test_utils.BarConfig(), - task_weight=0.5))) - test_multitask = multitask.MultiTask.from_config(config) - test_optimizer = tf.keras.optimizers.SGD(0.1) - model = test_utils.MockMultiTaskModel() - test_trainer = base_trainer.MultiTaskBaseTrainer( - multi_task=test_multitask, - multi_task_model=model, - optimizer=test_optimizer) - results = test_trainer.train(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertContainsSubset(["training_loss", "bar_acc"], - results["bar"].keys()) - self.assertContainsSubset(["training_loss", "foo_acc"], - results["foo"].keys()) - self.assertEqual(test_multitask.task_weight("foo"), 0.5) - self.assertEqual(test_trainer.global_step.numpy(), 5) - self.assertIn("learning_rate", results) - - -if __name__ == "__main__": - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/configs.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/configs.py deleted file mode 100644 index 1ce4674179501962c86fa06973a530e59cc8ed03..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/configs.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Configuration definitions for multi-task training.""" -from typing import Optional, Tuple - -import dataclasses - -from official.core import config_definitions as cfg -from official.modeling import hyperparams - - -@dataclasses.dataclass -class TaskRoutine(hyperparams.Config): - task_name: str = "" - task_config: cfg.TaskConfig = None - eval_steps: Optional[int] = None - task_weight: Optional[float] = 1.0 - - -@dataclasses.dataclass -class MultiTaskConfig(hyperparams.Config): - init_checkpoint: str = "" - model: hyperparams.Config = None - task_routines: Tuple[TaskRoutine, ...] = () - - -@dataclasses.dataclass -class ProportionalSampleConfig(hyperparams.Config): - alpha: float = 1.0 - - -@dataclasses.dataclass -class AnnealingSampleConfig(hyperparams.Config): - steps_per_epoch: int = 5 - total_steps: int = 20 - - -@dataclasses.dataclass -class TaskSamplingConfig(hyperparams.OneOfConfig): - type: str = "" - uniform: hyperparams.Config = hyperparams.Config() - proportional: ProportionalSampleConfig = ProportionalSampleConfig() - annealing: AnnealingSampleConfig = AnnealingSampleConfig() - - -@dataclasses.dataclass -class MultiTaskTrainerConfig(cfg.TrainerConfig): - trainer_type: str = "interleaving" - task_sampler: TaskSamplingConfig = TaskSamplingConfig(type="proportional") - - -@dataclasses.dataclass -class MultiTaskExperimentConfig(hyperparams.Config): - """An experiment config for multi-task training and multi-task evaluation.""" - task: MultiTaskConfig = MultiTaskConfig() - trainer: MultiTaskTrainerConfig = MultiTaskTrainerConfig() - runtime: cfg.RuntimeConfig = cfg.RuntimeConfig() - - -@dataclasses.dataclass -class MultiEvalExperimentConfig(cfg.ExperimentConfig): - """An experiment config for single-task training and multi-task evaluation. - - Attributes: - eval_tasks: individual evaluation tasks. - """ - eval_tasks: MultiTaskConfig = MultiTaskConfig() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/evaluator.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/evaluator.py deleted file mode 100644 index ec8551d88c9fb22c2c7d908b6a74a80bdea6fa5d..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/evaluator.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
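The dataclasses above compose into a full experiment description. A hedged sketch of that composition (`FooConfig`/`BarConfig` are the placeholder task configs registered in `test_utils` later in this diff; exact constructor behavior follows the `hyperparams.Config` conventions assumed here):

```python
from official.modeling.multitask import configs
from official.modeling.multitask import test_utils

experiment = configs.MultiTaskExperimentConfig(
    task=configs.MultiTaskConfig(
        task_routines=(
            configs.TaskRoutine(
                task_name="foo",
                task_config=test_utils.FooConfig(),
                task_weight=2.0),
            configs.TaskRoutine(
                task_name="bar",
                task_config=test_utils.BarConfig(),
                task_weight=1.0),
        )),
    trainer=configs.MultiTaskTrainerConfig(
        trainer_type="interleaving",
        task_sampler=configs.TaskSamplingConfig(type="proportional")),
)
```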
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""Multitask Evaluator implementation.
-
-The evaluator implements the Orbit `AbstractEvaluator` interface.
-"""
-from typing import Optional, Union
-import gin
-import orbit
-import tensorflow as tf
-
-from official.core import train_utils
-from official.modeling.multitask import base_model
-from official.modeling.multitask import multitask
-
-
-@gin.configurable
-class MultiTaskEvaluator(orbit.AbstractEvaluator):
-  """Implements a multi-task evaluator for TensorFlow models."""
-
-  def __init__(
-      self,
-      task: multitask.MultiTask,
-      model: Union[tf.keras.Model, base_model.MultiTaskBaseModel],
-      global_step: Optional[tf.Variable] = None,
-      checkpoint_exporter: Optional[train_utils.BestCheckpointExporter] = None):
-    """Initializes the multi-task evaluator for TensorFlow models.
-
-    Args:
-      task: A multitask.MultiTask instance.
-      model: tf.keras.Model instance.
-      global_step: the global step variable.
-      checkpoint_exporter: an object that has the `maybe_export_checkpoint`
-        interface.
-    """
-    # Gets the current distribution strategy. If not inside any strategy scope,
-    # it gets a single-replica no-op strategy.
-    self._strategy = tf.distribute.get_strategy()
-    self._task = task
-    self._model = model
-    self._global_step = global_step or orbit.utils.create_global_step()
-    self._checkpoint_exporter = checkpoint_exporter
-    self._checkpoint = tf.train.Checkpoint(
-        global_step=self.global_step,
-        model=self.model)
-
-    self._validation_losses = None
-    self._validation_metrics = None
-
-    # Builds per-task datasets.
-    self.eval_datasets = {}
-    for name, task in self.task.tasks.items():
-      self.eval_datasets[name] = orbit.utils.make_distributed_dataset(
-          self.strategy, task.build_inputs, task.task_config.validation_data)
-
-    # Builds per-task validation loops.
-    def get_function(task_name, task):
-
-      task_metrics = self.validation_metrics[task_name]
-      task_loss = self.validation_losses[task_name]
-      if isinstance(self.model, base_model.MultiTaskBaseModel):
-        model = self.model.sub_tasks[task_name]
-      else:
-        model = self.model
-
-      def step_fn(inputs):
-        logs = task.validation_step(inputs, model=model, metrics=task_metrics)
-        task_loss.update_state(logs[task.loss])
-        return logs
-
-      @tf.function
-      def eval_step_fn(iterator):
-        distributed_outputs = self.strategy.run(step_fn, args=(next(iterator),))
-        return tf.nest.map_structure(self.strategy.experimental_local_results,
-                                     distributed_outputs)
-
-      return orbit.utils.create_loop_fn(eval_step_fn)
-
-    self.task_fns = {
-        name: get_function(name, task)
-        for name, task in self.task.tasks.items()
-    }
-
-  @property
-  def strategy(self):
-    return self._strategy
-
-  @property
-  def task(self):
-    return self._task
-
-  @property
-  def model(self):
-    return self._model
-
-  @property
-  def global_step(self):
-    return self._global_step
-
-  @property
-  def validation_losses(self):
-    """Accesses the validation loss metric object."""
-    if self._validation_losses is None:
-      # Builds the per-task metrics and losses.
-      self._validation_losses = {}
-      for name in self.task.tasks:
-        self._validation_losses[name] = tf.keras.metrics.Mean(
-            "validation_loss", dtype=tf.float32)
-    return self._validation_losses
-
-  @property
-  def validation_metrics(self):
-    """Accesses all validation metric objects."""
-    if self._validation_metrics is None:
-      # Builds the per-task metrics and losses.
-      self._validation_metrics = {}
-      for name, task in self.task.tasks.items():
-        self._validation_metrics[name] = task.build_metrics(training=False)
-    return self._validation_metrics
-
-  @property
-  def checkpoint(self):
-    """Accesses the training checkpoint."""
-    return self._checkpoint
-
-  def evaluate(self, num_steps: tf.Tensor):
-    """Performs evaluation for each evaluation task."""
-    for metric in self.validation_losses.values():
-      metric.reset_states()
-    for metrics in self.validation_metrics.values():
-      for metric in metrics:
-        metric.reset_states()
-    results = {}
-    eval_iters = tf.nest.map_structure(iter, self.eval_datasets)
-
-    for name, task_eval_loop in self.task_fns.items():
-      outputs = None
-      eval_iter = eval_iters[name]
-      task = self.task.tasks[name]
-      task_eval_steps = self.task.task_eval_steps(name) or num_steps
-      outputs = task_eval_loop(
-          eval_iter,
-          task_eval_steps,
-          state=outputs,
-          reduce_fn=task.aggregate_logs)
-      task_metrics = self.validation_metrics[name]
-      task_loss = self.validation_losses[name]
-      logs = {}
-      for metric in task_metrics + [task_loss]:
-        logs[metric.name] = metric.result()
-      if outputs:
-        metrics = task.reduce_aggregated_logs(
-            outputs, global_step=self.global_step)
-        logs.update(metrics)
-      results[name] = logs
-
-    if self._checkpoint_exporter:
-      self._checkpoint_exporter.maybe_export_checkpoint(
-          self.checkpoint, results, self.global_step.numpy())
-    return results
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/evaluator_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/evaluator_test.py
deleted file mode 100644
index 7d3650b23d5d0554d8c6db5ef5cbdf6ccea78476..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/evaluator_test.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Copyright 2021 The TensorFlow Authors.
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Tests for multitask.evaluator.""" -from absl.testing import parameterized -import numpy as np -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations -from official.core import base_task -from official.core import config_definitions as cfg -from official.modeling.multitask import evaluator -from official.modeling.multitask import multitask - - -def all_strategy_combinations(): - return combinations.combine( - distribution=[ - strategy_combinations.default_strategy, - strategy_combinations.cloud_tpu_strategy, - strategy_combinations.one_device_strategy_gpu, - ], - mode="eager", - ) - - -class MockModel(tf.keras.Model): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.dense = tf.keras.layers.Dense(1) - - def call(self, inputs): - print(inputs, type(inputs)) - if "y" in inputs: - self.add_loss(tf.zeros((1,), dtype=tf.float32)) - else: - self.add_loss(tf.ones((1,), dtype=tf.float32)) - return self.dense(inputs["x"]) - - -class MockTask(base_task.Task): - """Mock task object for testing.""" - - def build_metrics(self, training: bool = True): - del training - return [tf.keras.metrics.Accuracy(name="acc")] - - def build_inputs(self, params): - - def generate_data(_): - x = tf.zeros(shape=(2,), dtype=tf.float32) - label = tf.zeros([1], dtype=tf.int32) - if self.name == "bar": - return dict(x=x, y=x), label - else: - return dict(x=x), label - - dataset = tf.data.Dataset.range(1) - dataset = dataset.repeat() - dataset = dataset.map( - generate_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) - return dataset.prefetch(buffer_size=1).batch(2, drop_remainder=True) - - def validation_step(self, inputs, model: tf.keras.Model, metrics=None): - logs = super().validation_step(inputs, model, metrics) - logs["counter"] = tf.ones((1,), dtype=tf.float32) - return logs - - def aggregate_logs(self, state, step_outputs): - if state is None: - state = {} - for key, value in step_outputs.items(): - if key not in state: - state[key] = [] - state[key].append( - np.concatenate([np.expand_dims(v.numpy(), axis=0) for v in value])) - return state - - def reduce_aggregated_logs(self, - aggregated_logs, - global_step=None): - for k, v in 
aggregated_logs.items(): - aggregated_logs[k] = np.sum(np.stack(v, axis=0)) - return aggregated_logs - - -class EvaluatorTest(tf.test.TestCase, parameterized.TestCase): - - @combinations.generate(all_strategy_combinations()) - def test_multitask_evaluator(self, distribution): - with distribution.scope(): - tasks = [ - MockTask(params=cfg.TaskConfig(), name="bar"), - MockTask(params=cfg.TaskConfig(), name="foo") - ] - test_multitask = multitask.MultiTask(tasks=tasks) - model = MockModel() - test_evaluator = evaluator.MultiTaskEvaluator( - task=test_multitask, model=model) - results = test_evaluator.evaluate(tf.convert_to_tensor(1, dtype=tf.int32)) - self.assertContainsSubset(["validation_loss", "acc"], results["bar"].keys()) - self.assertContainsSubset(["validation_loss", "acc"], results["foo"].keys()) - self.assertEqual(results["bar"]["validation_loss"], 0.0) - self.assertEqual(results["foo"]["validation_loss"], 1.0) - - @combinations.generate(all_strategy_combinations()) - def test_multitask_evaluator_numpy_metrics(self, distribution): - with distribution.scope(): - tasks = [ - MockTask(params=cfg.TaskConfig(), name="bar"), - MockTask(params=cfg.TaskConfig(), name="foo") - ] - test_multitask = multitask.MultiTask(tasks=tasks) - model = MockModel() - test_evaluator = evaluator.MultiTaskEvaluator( - task=test_multitask, model=model) - results = test_evaluator.evaluate(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertEqual(results["bar"]["counter"], - 5. * distribution.num_replicas_in_sync) - self.assertEqual(results["foo"]["counter"], - 5. * distribution.num_replicas_in_sync) - - -if __name__ == "__main__": - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/interleaving_trainer.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/interleaving_trainer.py deleted file mode 100644 index adfdfa409f0aa82884be02d1627a7cb9a86e740e..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/interleaving_trainer.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
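Condensed from the evaluator tests above, a hedged sketch of driving `MultiTaskEvaluator` directly; `MockTask` and `MockModel` are the test doubles defined in the test file above, not library classes:

```python
import tensorflow as tf

from official.core import config_definitions as cfg
from official.modeling.multitask import evaluator as evaluator_lib
from official.modeling.multitask import multitask

# MockTask/MockModel stand in for real tasks, as in the tests above.
test_multitask = multitask.MultiTask(tasks=[
    MockTask(params=cfg.TaskConfig(), name="bar"),
    MockTask(params=cfg.TaskConfig(), name="foo"),
])
test_evaluator = evaluator_lib.MultiTaskEvaluator(
    task=test_multitask, model=MockModel())
# One step per task unless task_eval_steps overrides it; results["foo"]
# and results["bar"] carry "validation_loss" and the task metrics.
results = test_evaluator.evaluate(tf.convert_to_tensor(1, dtype=tf.int32))
```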
-# - -"""Multitask trainer that interleaves each task's train step.""" -from typing import Union -import gin -import orbit -import tensorflow as tf -from official.modeling.multitask import base_model -from official.modeling.multitask import base_trainer -from official.modeling.multitask import multitask -from official.modeling.multitask import task_sampler as sampler - - -@gin.configurable -class MultiTaskInterleavingTrainer(base_trainer.MultiTaskBaseTrainer): - """MultiTask trainer that interleaves task update.""" - - def __init__(self, - multi_task: multitask.MultiTask, - multi_task_model: Union[tf.keras.Model, - base_model.MultiTaskBaseModel], - optimizer: tf.optimizers.Optimizer, - task_sampler: sampler.TaskSampler, - trainer_options=None): - super(MultiTaskInterleavingTrainer, self).__init__( - multi_task=multi_task, - multi_task_model=multi_task_model, - optimizer=optimizer, - trainer_options=trainer_options) - self._task_sampler = task_sampler - - # Build per task train step. - def _get_task_step(task_name, task): - - def step_fn(inputs): - if isinstance(self.multi_task_model, base_model.MultiTaskBaseModel): - task_model = self.multi_task_model.sub_tasks[task_name] - else: - task_model = self.multi_task_model - task_logs = task.train_step( - inputs, - model=task_model, - optimizer=self.optimizer, - metrics=self.training_metrics[task_name]) - self.training_losses[task_name].update_state(task_logs[task.loss]) - - return step_fn - - self._task_train_step_map = { - name: _get_task_step(name, task) - for name, task in self.multi_task.tasks.items() - } - - # TODO(haozhangthu): Add taskwise step counter to train_loop_end for logging - # on TensorBoard. - self._task_step_counters = { - name: orbit.utils.create_global_step() for name in self.multi_task.tasks - } - - def task_step_counter(self, name): - return self._task_step_counters[name] - - def train_step(self, iterator_map): - # Sample one task to train according to a multinomial distribution - rn = tf.random.stateless_uniform(shape=[], seed=(0, self.global_step)) - cumulative_sample_distribution = self._task_sampler.task_cumulative_distribution( - self.global_step) - # Prepend a [0.0] for indexing convenience. - cumulative_sample_distribution = tf.concat( - [tf.constant([0.0], dtype=tf.float32), cumulative_sample_distribution], - axis=0) - - for idx, (name, _) in enumerate(self.multi_task.tasks.items()): - begin = cumulative_sample_distribution[idx] - end = cumulative_sample_distribution[idx + 1] - if rn >= begin and rn < end: - self._strategy.run( - self._task_train_step_map[name], args=(next(iterator_map[name]),)) - self.global_step.assign_add(1) - self.task_step_counter(name).assign_add(1) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/interleaving_trainer_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/interleaving_trainer_test.py deleted file mode 100644 index b3318743d1cecd696c3403a66998fea395856889..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/interleaving_trainer_test.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
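The `train_step` of the interleaving trainer above maps one uniform draw onto a task by locating it between consecutive entries of the cumulative distribution. The same interval test in plain Python, with made-up numbers:

```python
import bisect

# Cumulative distribution for three tasks A, B, C with probabilities
# 0.25, 0.50, 0.25 (illustrative values only).
cumulative = [0.25, 0.75, 1.0]
names = ["A", "B", "C"]

rn = 0.6  # stand-in for the tf.random.stateless_uniform draw
picked = names[bisect.bisect_right(cumulative, rn)]
assert picked == "B"  # because 0.25 <= 0.6 < 0.75
```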
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Tests for multitask.interleaving_trainer.""" -from absl.testing import parameterized -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations -from official.modeling.multitask import configs -from official.modeling.multitask import interleaving_trainer -from official.modeling.multitask import multitask -from official.modeling.multitask import task_sampler -from official.modeling.multitask import test_utils - - -def all_strategy_combinations(): - return combinations.combine( - distribution=[ - strategy_combinations.default_strategy, - strategy_combinations.cloud_tpu_strategy, - strategy_combinations.one_device_strategy_gpu, - ], - mode="eager", - ) - - -class InterleavingTrainerTest(tf.test.TestCase, parameterized.TestCase): - - @combinations.generate(all_strategy_combinations()) - def test_multitask_interleaving_trainer(self, distribution): - with distribution.scope(): - tasks = [ - test_utils.MockFooTask(params=test_utils.FooConfig(), name="foo"), - test_utils.MockBarTask(params=test_utils.BarConfig(), name="bar") - ] - test_multitask = multitask.MultiTask(tasks=tasks) - test_optimizer = tf.keras.optimizers.SGD(0.1) - model = test_utils.MockMultiTaskModel() - sampler = task_sampler.UniformTaskSampler( - task_weights=test_multitask.task_weights) - test_trainer = interleaving_trainer.MultiTaskInterleavingTrainer( - multi_task=test_multitask, - multi_task_model=model, - optimizer=test_optimizer, - task_sampler=sampler) - results = test_trainer.train(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertContainsSubset(["training_loss", "bar_acc"], - results["bar"].keys()) - self.assertContainsSubset(["training_loss", "foo_acc"], - results["foo"].keys()) - - @combinations.generate(all_strategy_combinations()) - def test_trainer_with_configs(self, distribution): - config = configs.MultiTaskConfig( - task_routines=(configs.TaskRoutine( - task_name="foo", - task_config=test_utils.FooConfig(), - task_weight=3.0), - configs.TaskRoutine( - task_name="bar", - task_config=test_utils.BarConfig(), - task_weight=1.0))) - with distribution.scope(): - test_multitask = multitask.MultiTask.from_config(config) - test_optimizer = tf.keras.optimizers.SGD(0.1) - model = test_utils.MockMultiTaskModel() - num_step = 1000 - sampler = task_sampler.AnnealingTaskSampler( - task_weights=test_multitask.task_weights, - steps_per_epoch=num_step/5, 
- total_steps=num_step) - test_trainer = interleaving_trainer.MultiTaskInterleavingTrainer( - multi_task=test_multitask, - multi_task_model=model, - optimizer=test_optimizer, - task_sampler=sampler) - results = test_trainer.train(tf.convert_to_tensor(num_step, dtype=tf.int32)) - self.assertContainsSubset(["training_loss", "bar_acc"], - results["bar"].keys()) - self.assertContainsSubset(["training_loss", "foo_acc"], - results["foo"].keys()) - self.assertEqual(test_trainer.global_step.numpy(), num_step) - bar_sampled_step = test_trainer.task_step_counter("bar").numpy() - foo_sampled_step = test_trainer.task_step_counter("foo").numpy() - self.assertEqual(bar_sampled_step + foo_sampled_step, num_step) - - -if __name__ == "__main__": - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/multitask.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/multitask.py deleted file mode 100644 index ef8e720c6c60f752ae774b47a63b344a70a49b6a..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/multitask.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Experimental MultiTask base class for multi-task training/evaluation.""" -import abc -from typing import Dict, List, Optional, Text, Union - -import tensorflow as tf -from official.core import base_task -from official.core import config_definitions -from official.core import task_factory -from official.modeling import optimization -from official.modeling.multitask import base_model -from official.modeling.multitask import configs - -OptimizationConfig = optimization.OptimizationConfig -RuntimeConfig = config_definitions.RuntimeConfig - - -class MultiTask(tf.Module, metaclass=abc.ABCMeta): - """A multi-task class to manage multiple tasks.""" - - def __init__(self, - tasks: Union[Dict[Text, base_task.Task], List[base_task.Task]], - task_weights: Optional[Dict[str, Union[float, int]]] = None, - task_eval_steps: Optional[Dict[str, int]] = None, - name: Optional[str] = None): - """MultiTask initialization. - - Args: - tasks: a list or a flat dict of Task. 
-      task_weights: a dict of (task, task weight); a task weight can be applied
-        directly during loss summation in a joint backward step, or used to
-        sample a task among interleaved backward steps.
-      task_eval_steps: a dict of (task, eval steps).
-      name: the instance name of a MultiTask object.
-    """
-    super().__init__(name=name)
-    if isinstance(tasks, list):
-      self._tasks = {}
-      for task in tasks:
-        if task.name in self._tasks:
-          raise ValueError("Duplicated tasks found, task.name is %s" %
-                           task.name)
-        self._tasks[task.name] = task
-    elif isinstance(tasks, dict):
-      self._tasks = tasks
-    else:
-      raise ValueError("The tasks argument has an invalid type: %s" %
-                       type(tasks))
-    self._task_eval_steps = task_eval_steps or {}
-    self._task_eval_steps = dict([
-        (name, self._task_eval_steps.get(name, None)) for name in self.tasks
-    ])
-    self._task_weights = task_weights or {}
-    self._task_weights = dict([
-        (name, self._task_weights.get(name, 1.0)) for name in self.tasks
-    ])
-
-  @classmethod
-  def from_config(cls, config: configs.MultiTaskConfig, logging_dir=None):
-    tasks = {}
-    task_eval_steps = {}
-    task_weights = {}
-    for task_routine in config.task_routines:
-      task_name = task_routine.task_name
-      tasks[task_name] = task_factory.get_task(
-          task_routine.task_config, logging_dir=logging_dir)
-      task_eval_steps[task_name] = task_routine.eval_steps
-      task_weights[task_name] = task_routine.task_weight
-    return cls(
-        tasks, task_eval_steps=task_eval_steps, task_weights=task_weights)
-
-  @property
-  def tasks(self):
-    return self._tasks
-
-  def task_eval_steps(self, task_name):
-    return self._task_eval_steps[task_name]
-
-  def task_weight(self, task_name):
-    return self._task_weights[task_name]
-
-  @property
-  def task_weights(self):
-    return self._task_weights
-
-  @classmethod
-  def create_optimizer(cls,
-                       optimizer_config: OptimizationConfig,
-                       runtime_config: Optional[RuntimeConfig] = None):
-    return base_task.Task.create_optimizer(
-        optimizer_config=optimizer_config, runtime_config=runtime_config)
-
-  def joint_train_step(self, task_inputs,
-                       multi_task_model: base_model.MultiTaskBaseModel,
-                       optimizer: tf.keras.optimizers.Optimizer, task_metrics):
-    """The joint train step.
-
-    Args:
-      task_inputs: a dictionary of task names and per-task features.
-      multi_task_model: a MultiTaskBaseModel instance.
-      optimizer: a tf.optimizers.Optimizer.
-      task_metrics: a dictionary of task names and per-task metrics.
-
-    Returns:
-      A dictionary of losses, including per-task losses and their weighted sum.
-    """
-    losses = {}
-    with tf.GradientTape() as tape:
-      total_loss = 0.0
-      for name, model in multi_task_model.sub_tasks.items():
-        inputs = task_inputs[name]
-        if isinstance(inputs, tuple) and len(inputs) == 2:
-          features, labels = inputs
-        elif isinstance(inputs, dict):
-          features, labels = inputs, inputs
-        else:
-          raise ValueError("The iterator output is neither a tuple nor a "
-                           "dictionary; such outputs are not supported.")
-        outputs = model(features, training=True)
-        task_loss = self.tasks[name].build_losses(labels, outputs)
-        task_weight = self.task_weight(name)
-        total_loss += task_weight * task_loss
-        losses[name] = task_loss
-        self.tasks[name].process_metrics(task_metrics[name], labels, outputs)
-
-      # Scales loss as the default gradients allreduce performs sum inside
-      # the optimizer.
-      scaled_loss = total_loss / tf.distribute.get_strategy(
-      ).num_replicas_in_sync
-    tvars = multi_task_model.trainable_variables
-    grads = tape.gradient(scaled_loss, tvars)
-    optimizer.apply_gradients(list(zip(grads, tvars)))
-    losses["total_loss"] = total_loss
-    return losses
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/task_sampler.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/task_sampler.py
deleted file mode 100644
index e37b7a55481921f348657a661099a9973b17cf9e..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/task_sampler.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""Utils to sample tasks for interleaved optimization."""
-import abc
-from typing import Union, Dict, Text
-import tensorflow as tf
-
-from official.modeling.multitask import configs
-
-
-class TaskSampler(tf.Module, metaclass=abc.ABCMeta):
-  """An abstract class defining task sampling API for interleaving trainer."""
-
-  def __init__(self, task_weights: Dict[Text, Union[float, int]]):
-    self._task_weights = task_weights
-
-  @abc.abstractmethod
-  def task_cumulative_distribution(self, global_step: tf.Tensor) -> tf.Tensor:
-    """Compute cumulative distribution to sample tasks.
-
-    It calculates the cumulative distribution of the multinomial task
-    distribution from which tasks are sampled.
-
-    Args:
-      global_step: A tensor indicating current progress of training.
-
-    Returns:
-      A float tensor with shape (num_tasks,) that represents the cumulative
-      sampling distribution.
- """ - pass - - -class UniformTaskSampler(TaskSampler): - """Sample all tasks uniformly.""" - - def __init__(self, task_weights: Dict[Text, Union[float, int]]): - super(UniformTaskSampler, self).__init__(task_weights=task_weights) - self._uniform_cumulative = tf.math.cumsum( - tf.constant( - [1.0 / len(self._task_weights)] * len(self._task_weights), - dtype=tf.float32)) - - def task_cumulative_distribution(self, global_step: tf.Tensor) -> tf.Tensor: - del global_step - return self._uniform_cumulative - - -class ProportionalTaskSampler(TaskSampler): - """Sample tasks proportional to task weights.""" - - def __init__(self, - task_weights: Dict[Text, Union[float, int]], - alpha: float = 1.0): - super(ProportionalTaskSampler, self).__init__(task_weights=task_weights) - self._alpha = tf.cast(alpha, dtype=tf.float32) - task_weight_dict_ordered_list = tf.constant( - [weight for _, weight in self._task_weights.items()], dtype=tf.float32) - task_sizes = tf.math.pow(task_weight_dict_ordered_list, self._alpha) - task_distribution = task_sizes / tf.reduce_sum(task_sizes) - self._porportional_cumulative = tf.math.cumsum(task_distribution) - - def task_cumulative_distribution(self, global_step: tf.Tensor) -> tf.Tensor: - del global_step - return self._porportional_cumulative - - -class AnnealingTaskSampler(TaskSampler): - """Sample tasks according to task weights as well as training progress.""" - - def __init__(self, - task_weights: Dict[Text, Union[float, int]], - steps_per_epoch: int, - total_steps: int): - super(AnnealingTaskSampler, self).__init__(task_weights=task_weights) - self._steps_per_epoch = tf.cast(steps_per_epoch, dtype=tf.float32) - self._total_epochs = tf.cast( - total_steps / self._steps_per_epoch, dtype=tf.float32) - - def task_cumulative_distribution(self, global_step: tf.Tensor) -> tf.Tensor: - cur_epoch = tf.math.floor( - tf.cast(global_step, dtype=tf.float32) / self._steps_per_epoch) - alpha = 1.0 - 0.8 * (cur_epoch - 1) / (self._total_epochs - 1 + 1e-10) - task_weight_dict_ordered_list = [ - weight for _, weight in self._task_weights.items() - ] - task_sizes = tf.math.pow( - tf.constant(task_weight_dict_ordered_list, dtype=tf.float32), - tf.cast(alpha, dtype=tf.float32)) - dynamic_task_distribution = task_sizes / tf.reduce_sum(task_sizes) - return tf.math.cumsum(dynamic_task_distribution) - - -def get_task_sampler(config: configs.TaskSamplingConfig, - task_weights: Dict[Text, float]) -> TaskSampler: - """Utils to create task sampler with configuration and task weights.""" - oneof_config = config.get() - if config.type == 'uniform': - return UniformTaskSampler(task_weights=task_weights) - elif config.type == 'proportional': - return ProportionalTaskSampler( - task_weights=task_weights, alpha=oneof_config.alpha) - elif config.type == 'annealing': - return AnnealingTaskSampler( - task_weights=task_weights, - steps_per_epoch=oneof_config.steps_per_epoch, - total_steps=oneof_config.total_steps) - else: - raise RuntimeError('Task sampler type not supported') diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/task_sampler_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/task_sampler_test.py deleted file mode 100644 index afd17cbdee86f1470a19c3f345b5e03cf36cef40..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/task_sampler_test.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. 
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/task_sampler_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/task_sampler_test.py
deleted file mode 100644
index afd17cbdee86f1470a19c3f345b5e03cf36cef40..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/task_sampler_test.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""Tests for multitask.task_sampler."""
-import tensorflow as tf
-
-from official.modeling.multitask import configs
-from official.modeling.multitask import task_sampler as sampler
-
-
-class TaskSamplerTest(tf.test.TestCase):
-
-  def setUp(self):
-    super(TaskSamplerTest, self).setUp()
-    self._task_weights = {'A': 1.0, 'B': 2.0, 'C': 3.0}
-
-  def test_uniform_sample_distribution(self):
-    uniform_sampler = sampler.get_task_sampler(
-        configs.TaskSamplingConfig(type='uniform'), self._task_weights)
-    for step in range(5):
-      cumulative_distribution = uniform_sampler.task_cumulative_distribution(
-          tf.constant(step, dtype=tf.int64))
-      self.assertAllClose([0.333333, 0.666666, 1.0],
-                          cumulative_distribution.numpy())
-
-  def test_proportional_sample_distribution(self):
-    prop_sampler = sampler.get_task_sampler(
-        configs.TaskSamplingConfig(
-            type='proportional',
-            proportional=configs.ProportionalSampleConfig(alpha=2.0)),
-        self._task_weights)
-    # CumulativeOf(Normalize([1.0^2, 2.0^2, 3.0^2]))
-    for step in range(5):
-      cumulative_distribution = prop_sampler.task_cumulative_distribution(
-          tf.constant(step, dtype=tf.int64))
-      self.assertAllClose([0.07142857, 0.35714286, 1.0],
-                          cumulative_distribution.numpy())
-
-  def test_annealing_sample_distribution(self):
-    num_epoch = 3
-    step_per_epoch = 6
-    anneal_sampler = sampler.get_task_sampler(
-        configs.TaskSamplingConfig(
-            type='annealing',
-            annealing=configs.AnnealingSampleConfig(
-                steps_per_epoch=step_per_epoch,
-                total_steps=step_per_epoch * num_epoch)), self._task_weights)
-
-    global_step = tf.Variable(
-        0, dtype=tf.int64, name='global_step', trainable=False)
-    expected_cumulative_epochs = [[0.12056106, 0.4387236, 1.0],
-                                  [0.16666667, 0.5, 1.0],
-                                  [0.22477472, 0.5654695, 1.0]]
-    for epoch in range(num_epoch):
-      for _ in range(step_per_epoch):
-        cumulative_distribution = anneal_sampler.task_cumulative_distribution(
-            tf.constant(global_step, dtype=tf.int64))
-        global_step.assign_add(1)
-        self.assertAllClose(expected_cumulative_epochs[epoch],
-                            cumulative_distribution.numpy())
-
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/test_utils.py
b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/test_utils.py deleted file mode 100644 index 5f4de8b8d2ed373e55170587b83414a61dde1daf..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/test_utils.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Testing utils for mock models and tasks.""" -from typing import Dict, Text -import tensorflow as tf -from official.core import base_task -from official.core import config_definitions as cfg -from official.core import task_factory -from official.modeling.multitask import base_model - - -class MockFooModel(tf.keras.Model): - """A mock model can consume 'foo' and 'bar' inputs.""" - - def __init__(self, shared_layer, *args, **kwargs): - super().__init__(*args, **kwargs) - self._share_layer = shared_layer - self._foo_specific_layer = tf.keras.layers.Dense(1) - - def call(self, inputs): - self.add_loss(tf.zeros((1,), dtype=tf.float32)) - if "foo" in inputs: - input_tensor = inputs["foo"] - else: - input_tensor = inputs["bar"] - return self._foo_specific_layer(self._share_layer(input_tensor)) - - -class MockBarModel(tf.keras.Model): - - def __init__(self, shared_layer, *args, **kwargs): - super().__init__(*args, **kwargs) - self._share_layer = shared_layer - self._bar_specific_layer = tf.keras.layers.Dense(1) - - def call(self, inputs): - self.add_loss(tf.zeros((2,), dtype=tf.float32)) - return self._bar_specific_layer(self._share_layer(inputs["bar"])) - - -class MockMultiTaskModel(base_model.MultiTaskBaseModel): - - def __init__(self, *args, **kwargs): - self._shared_dense = tf.keras.layers.Dense(1) - super().__init__(*args, **kwargs) - - def _instantiate_sub_tasks(self) -> Dict[Text, tf.keras.Model]: - return { - "foo": MockFooModel(self._shared_dense), - "bar": MockBarModel(self._shared_dense) - } - - -def mock_data(feature_name): - """Mock dataset function.""" - - def _generate_data(_): - x = tf.zeros(shape=(2,), dtype=tf.float32) - label = tf.zeros([1], dtype=tf.int32) - return {feature_name: x}, label - - dataset = tf.data.Dataset.range(1) - dataset = dataset.repeat() - dataset = dataset.map( - _generate_data, 
num_parallel_calls=tf.data.experimental.AUTOTUNE) - return dataset.prefetch(buffer_size=1).batch(2, drop_remainder=True) - - -class FooConfig(cfg.TaskConfig): - pass - - -class BarConfig(cfg.TaskConfig): - pass - - -@task_factory.register_task_cls(FooConfig) -class MockFooTask(base_task.Task): - """Mock foo task object for testing.""" - - def build_metrics(self, training: bool = True): - del training - return [tf.keras.metrics.Accuracy(name="foo_acc")] - - def build_inputs(self, params): - return mock_data("foo") - - def build_model(self) -> tf.keras.Model: - return MockFooModel(shared_layer=tf.keras.layers.Dense(1)) - - def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor: - loss = tf.keras.losses.mean_squared_error(labels, model_outputs) - if aux_losses: - loss += tf.add_n(aux_losses) - return tf.reduce_mean(loss) - - -@task_factory.register_task_cls(BarConfig) -class MockBarTask(base_task.Task): - """Mock bar task object for testing.""" - - def build_metrics(self, training: bool = True): - del training - return [tf.keras.metrics.Accuracy(name="bar_acc")] - - def build_inputs(self, params): - return mock_data("bar") - - def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor: - loss = tf.keras.losses.mean_squared_error(labels, model_outputs) - if aux_losses: - loss += tf.add_n(aux_losses) - return tf.reduce_mean(loss) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/train_lib.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/train_lib.py deleted file mode 100644 index 6c3273fb3145ff9786e1eb65aafeb50e45c7ed24..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/train_lib.py +++ /dev/null @@ -1,265 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
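The registration pattern in `test_utils` above binds a config class to a task class so that `task_factory.get_task(task_config)` can construct the right task from a `TaskRoutine`. A minimal hypothetical registration following the same shape (`MyTaskConfig`/`MyTask` are illustrative names, not from the source):

```python
import tensorflow as tf

from official.core import base_task
from official.core import config_definitions as cfg
from official.core import task_factory


class MyTaskConfig(cfg.TaskConfig):  # hypothetical config class
  pass


@task_factory.register_task_cls(MyTaskConfig)
class MyTask(base_task.Task):
  """Constructed via task_factory.get_task(MyTaskConfig(), ...)."""

  def build_model(self) -> tf.keras.Model:
    return tf.keras.Sequential([tf.keras.layers.Dense(1)])
```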
-# - -"""Multitask training driver library.""" -# pytype: disable=attribute-error -import os -from typing import Optional -from absl import logging -import orbit -import tensorflow as tf -from official.core import base_task -from official.core import base_trainer as core_lib -from official.core import train_utils -from official.modeling.multitask import base_model -from official.modeling.multitask import base_trainer -from official.modeling.multitask import configs -from official.modeling.multitask import evaluator as evaluator_lib -from official.modeling.multitask import interleaving_trainer -from official.modeling.multitask import multitask -from official.modeling.multitask import task_sampler - -TRAINERS = { - 'interleaving': interleaving_trainer.MultiTaskInterleavingTrainer, - 'joint': base_trainer.MultiTaskBaseTrainer -} - - -def run_experiment(*, distribution_strategy: tf.distribute.Strategy, - task: multitask.MultiTask, - model: base_model.MultiTaskBaseModel, mode: str, - params: configs.MultiTaskExperimentConfig, - model_dir: str) -> base_model.MultiTaskBaseModel: - """Runs train/eval configured by the experiment params. - - Args: - distribution_strategy: A distribution distribution_strategy. - task: A MultiTaskTask instance. - model: A MultiTaskBaseModel instance. - mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval' - or 'continuous_eval'. - params: ExperimentConfig instance. - model_dir: A 'str', a path to store model checkpoints and summaries. - - Returns: - model: `base_model.MultiTaskBaseModel` instance. - """ - - is_training = 'train' in mode - is_eval = 'eval' in mode - with distribution_strategy.scope(): - optimizer = task.create_optimizer(params.trainer.optimizer_config, - params.runtime) - kwargs = dict(multi_task=task, multi_task_model=model, optimizer=optimizer) - if params.trainer.trainer_type == 'interleaving': - sampler = task_sampler.get_task_sampler(params.trainer.task_sampler, - task.task_weights) - kwargs.update(dict(task_sampler=sampler)) - trainer = TRAINERS[params.trainer.trainer_type]( - **kwargs) if is_training else None - if is_eval: - evaluator = evaluator_lib.MultiTaskEvaluator( - task=task, - model=model, - global_step=trainer.global_step if is_training else None) - else: - evaluator = None - - if trainer: - checkpoint = trainer.checkpoint - global_step = trainer.global_step - else: - checkpoint = evaluator.checkpoint - global_step = evaluator.global_step - - # TODO(hongkuny,haozhangthu): Revisit initialization method. 
- checkpoint_manager = tf.train.CheckpointManager( - checkpoint, - directory=model_dir, - max_to_keep=params.trainer.max_to_keep, - step_counter=global_step, - checkpoint_interval=params.trainer.checkpoint_interval, - init_fn=model.initialize) - - controller = orbit.Controller( - strategy=distribution_strategy, - trainer=trainer, - evaluator=evaluator, - global_step=global_step, - steps_per_loop=params.trainer.steps_per_loop, - checkpoint_manager=checkpoint_manager, - summary_dir=os.path.join(model_dir, 'train'), - eval_summary_dir=os.path.join(model_dir, 'validation'), - summary_interval=params.trainer.summary_interval) - - logging.info('Starts to execute mode: %s', mode) - with distribution_strategy.scope(): - if mode == 'train': - controller.train(steps=params.trainer.train_steps) - elif mode == 'train_and_eval': - controller.train_and_evaluate( - train_steps=params.trainer.train_steps, - eval_steps=params.trainer.validation_steps, - eval_interval=params.trainer.validation_interval) - elif mode == 'eval': - controller.evaluate(steps=params.trainer.validation_steps) - elif mode == 'continuous_eval': - - def timeout_fn(): - if evaluator.global_step.numpy() >= params.trainer.train_steps: - return True - return False - - controller.evaluate_continuously( - steps=params.trainer.validation_steps, - timeout=params.trainer.continuous_eval_timeout, - timeout_fn=timeout_fn) - else: - raise NotImplementedError('The mode is not implemented: %s' % mode) - - return model - - -def run_experiment_with_multitask_eval( - *, - distribution_strategy: tf.distribute.Strategy, - train_task: base_task.Task, - eval_tasks: multitask.MultiTask, - mode: str, - params: configs.MultiEvalExperimentConfig, - model_dir: str, - run_post_eval: bool = False, - save_summary: bool = True, - trainer: Optional[core_lib.Trainer] = None) -> tf.keras.Model: - """Runs train/eval configured by the experiment params. - - Args: - distribution_strategy: A distribution strategy. - train_task: A base_task.Task instance. - eval_tasks: A multitask.MultiTask with evaluation tasks. - mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval' - or 'continuous_eval'. - params: A MultiEvalExperimentConfig instance. - model_dir: A 'str', a path to store model checkpoints and summaries. - run_post_eval: Whether to run one evaluation after training; if True, the - metrics logs are returned alongside the model. - save_summary: Whether to save train and validation summaries. - trainer: The core_lib.Trainer instance. It should be created within the - strategy.scope(). If not provided, an instance will be created by default - if `mode` contains 'train'. - - Returns: - A tuple of the `tf.keras.Model` instance and a dict of post-training - evaluation metrics (empty unless `run_post_eval` is True).
- """ - - is_training = 'train' in mode - is_eval = 'eval' in mode - with distribution_strategy.scope(): - if is_training: - trainer = trainer or core_lib.Trainer( - config=params, - task=train_task, - model=train_task.build_model(), - optimizer=train_task.create_optimizer( - params.trainer.optimizer_config, params.runtime), - train=True, - evaluate=False) - else: - trainer = None - model = trainer.model if trainer else train_task.build_model() - - if is_eval: - evaluator = evaluator_lib.MultiTaskEvaluator( - task=eval_tasks, - model=model, - global_step=trainer.global_step if is_training else None, - checkpoint_exporter=train_utils.maybe_create_best_ckpt_exporter( - params, model_dir)) - else: - evaluator = None - - if trainer: - checkpoint = trainer.checkpoint - global_step = trainer.global_step - else: - checkpoint = evaluator.checkpoint - global_step = evaluator.global_step - - checkpoint_manager = tf.train.CheckpointManager( - checkpoint, - directory=model_dir, - max_to_keep=params.trainer.max_to_keep, - step_counter=global_step, - checkpoint_interval=params.trainer.checkpoint_interval, - init_fn=trainer.initialize if trainer else None) - - controller = orbit.Controller( - strategy=distribution_strategy, - trainer=trainer, - evaluator=evaluator, - global_step=global_step, - steps_per_loop=params.trainer.steps_per_loop, - checkpoint_manager=checkpoint_manager, - summary_dir=os.path.join(model_dir, 'train') if save_summary else None, - eval_summary_dir=os.path.join(model_dir, 'validation') if - (save_summary) else None, - summary_interval=params.trainer.summary_interval if - (save_summary) else None) - - logging.info('Starts to execute mode: %s', mode) - with distribution_strategy.scope(): - if mode == 'train': - controller.train(steps=params.trainer.train_steps) - elif mode == 'train_and_eval': - controller.train_and_evaluate( - train_steps=params.trainer.train_steps, - eval_steps=params.trainer.validation_steps, - eval_interval=params.trainer.validation_interval) - elif mode == 'eval': - controller.evaluate(steps=params.trainer.validation_steps) - elif mode == 'continuous_eval': - - def timeout_fn(): - if evaluator.global_step.numpy() >= params.trainer.train_steps: - return True - return False - - controller.evaluate_continuously( - steps=params.trainer.validation_steps, - timeout=params.trainer.continuous_eval_timeout, - timeout_fn=timeout_fn) - else: - raise NotImplementedError('The mode is not implemented: %s' % mode) - - if run_post_eval: - return model, evaluator.evaluate( - tf.convert_to_tensor(params.trainer.validation_steps)) - else: - return model, {} diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/train_lib_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/train_lib_test.py deleted file mode 100644 index e145e95b2494a6f77703f0c91f555746da265e20..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/multitask/train_lib_test.py +++ /dev/null @@ -1,136 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Tests for multitask.train_lib.""" -from absl.testing import parameterized -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations -from official.core import task_factory -from official.modeling.hyperparams import params_dict -from official.modeling.multitask import configs -from official.modeling.multitask import multitask -from official.modeling.multitask import test_utils -from official.modeling.multitask import train_lib - - -class TrainLibTest(tf.test.TestCase, parameterized.TestCase): - - def setUp(self): - super().setUp() - self._test_config = { - 'trainer': { - 'checkpoint_interval': 10, - 'steps_per_loop': 10, - 'summary_interval': 10, - 'train_steps': 10, - 'validation_steps': 5, - 'validation_interval': 10, - 'continuous_eval_timeout': 1, - 'optimizer_config': { - 'optimizer': { - 'type': 'sgd', - }, - 'learning_rate': { - 'type': 'constant' - } - } - }, - } - - @combinations.generate( - combinations.combine( - distribution_strategy=[ - strategy_combinations.default_strategy, - strategy_combinations.cloud_tpu_strategy, - strategy_combinations.one_device_strategy_gpu, - ], - mode='eager', - flag_mode=['train', 'eval', 'train_and_eval'])) - def test_end_to_end(self, distribution_strategy, flag_mode): - model_dir = self.get_temp_dir() - experiment_config = configs.MultiTaskExperimentConfig( - task=configs.MultiTaskConfig( - task_routines=( - configs.TaskRoutine( - task_name='foo', - task_config=test_utils.FooConfig()), - configs.TaskRoutine( - task_name='bar', task_config=test_utils.BarConfig())))) - experiment_config = params_dict.override_params_dict( - experiment_config, self._test_config, is_strict=False) - with distribution_strategy.scope(): - test_multitask = multitask.MultiTask.from_config(experiment_config.task) - model = test_utils.MockMultiTaskModel() - train_lib.run_experiment( - distribution_strategy=distribution_strategy, - task=test_multitask, - model=model, - mode=flag_mode, - params=experiment_config, - model_dir=model_dir) - - @combinations.generate( - combinations.combine( - distribution_strategy=[ - strategy_combinations.default_strategy, - strategy_combinations.cloud_tpu_strategy, - strategy_combinations.one_device_strategy_gpu, - ], - mode='eager', - flag_mode=['train', 'eval', 'train_and_eval'])) - def test_end_to_end_multi_eval(self, distribution_strategy, flag_mode): - model_dir = 
self.get_temp_dir() - experiment_config = configs.MultiEvalExperimentConfig( - task=test_utils.FooConfig(), - eval_tasks=configs.MultiTaskConfig( - task_routines=( - configs.TaskRoutine( - task_name='foo', - task_config=test_utils.FooConfig()), - configs.TaskRoutine( - task_name='bar', task_config=test_utils.BarConfig())))) - experiment_config = params_dict.override_params_dict( - experiment_config, self._test_config, is_strict=False) - with distribution_strategy.scope(): - train_task = task_factory.get_task(experiment_config.task) - eval_tasks = multitask.MultiTask.from_config(experiment_config.eval_tasks) - train_lib.run_experiment_with_multitask_eval( - distribution_strategy=distribution_strategy, - train_task=train_task, - eval_tasks=eval_tasks, - mode=flag_mode, - params=experiment_config, - model_dir=model_dir) - - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/__init__.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/__init__.py deleted file mode 100644 index 8b71d02a775d52a95fbfc00b3f50a53e1a0e9dbe..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/__init__.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""Optimization package definition.""" - -# pylint: disable=wildcard-import -from official.modeling.optimization.configs.learning_rate_config import * -from official.modeling.optimization.configs.optimization_config import * -from official.modeling.optimization.configs.optimizer_config import * -from official.modeling.optimization.ema_optimizer import ExponentialMovingAverage -from official.modeling.optimization.lr_schedule import * -from official.modeling.optimization.optimizer_factory import OptimizerFactory diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/configs/__init__.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/configs/__init__.py deleted file mode 100644 index a11b1ff79e891e0fcee5bf824718e75d9103e28f..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/configs/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/configs/learning_rate_config.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/configs/learning_rate_config.py deleted file mode 100644 index 52e152e561ecfb4be2b8609be32b097ec51396e8..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/configs/learning_rate_config.py +++ /dev/null @@ -1,258 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
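Because of the wildcard re-exports in the package `__init__.py` above, downstream code can reach the whole optimization API from the package root. A hedged sketch of typical usage; `OptimizerFactory.build_learning_rate`/`build_optimizer` come from the factory module, which is not shown in this diff, so treat the exact calls as assumptions:

```python
from official.modeling import optimization

opt_config = optimization.OptimizationConfig({
    'optimizer': {'type': 'sgd', 'sgd': {'momentum': 0.9}},
    'learning_rate': {'type': 'constant', 'constant': {'learning_rate': 0.1}},
})
factory = optimization.OptimizerFactory(opt_config)
lr = factory.build_learning_rate()       # constant 0.1 in this config
optimizer = factory.build_optimizer(lr)  # Keras SGD with momentum 0.9
```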
-# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Dataclasses for learning rate schedule config.""" -from typing import List, Optional - -import dataclasses -from official.modeling.hyperparams import base_config - - -@dataclasses.dataclass -class ConstantLrConfig(base_config.Config): - """Configuration for constant learning rate. - - This class is a container for the constant learning rate decay configs. - - Attributes: - name: The name of the learning rate schedule. Defaults to Constant. - learning_rate: A float. The learning rate. Defaults to 0.1. - """ - name: str = 'Constant' - learning_rate: float = 0.1 - - -@dataclasses.dataclass -class StepwiseLrConfig(base_config.Config): - """Configuration for stepwise learning rate decay. - - This class is a container for the piecewise constant learning rate scheduling - configs. It will configure an instance of the PiecewiseConstantDecay keras - learning rate schedule. - - An example (from keras docs): use a learning rate that's 1.0 for the first - 100001 steps, 0.5 for the next 10000 steps, and 0.1 for any additional steps. - ```python - boundaries: [100000, 110000] - values: [1.0, 0.5, 0.1] - ``` - - Attributes: - name: The name of the learning rate schedule. Defaults to - PiecewiseConstantDecay. - boundaries: A list of ints with strictly increasing entries. Defaults to - None. - values: A list of floats that specifies the values for the intervals defined - by `boundaries`. It should have one more element than `boundaries`. - The learning rate is computed as follows: [0, boundaries[0]] -> - values[0]; [boundaries[0], boundaries[1]] -> values[1]; ...; - [boundaries[n-1], boundaries[n]] -> values[n]; [boundaries[n], - end] -> values[n+1]. Defaults to None. - """ - name: str = 'PiecewiseConstantDecay' - boundaries: Optional[List[int]] = None - values: Optional[List[float]] = None - - -@dataclasses.dataclass -class ExponentialLrConfig(base_config.Config): - """Configuration for exponential learning rate decay. - - This class is a container for the exponential learning rate decay configs. - - Attributes: - name: The name of the learning rate schedule. Defaults to ExponentialDecay. - initial_learning_rate: A float. The initial learning rate. Defaults to None. - decay_steps: A positive integer that is used for decay computation. Defaults - to None. - decay_rate: A float. Defaults to None. - staircase: A boolean, if true, the learning rate is decreased at discrete - intervals. Defaults to False. - """ - name: str = 'ExponentialDecay' - initial_learning_rate: Optional[float] = None - decay_steps: Optional[int] = None - decay_rate: Optional[float] = None - staircase: Optional[bool] = None - -
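To make the stepwise schedule concrete, a small sketch translating a StepwiseLrConfig into the Keras schedule its docstring names (constructing the config with field keyword arguments, the same way Config dataclasses are constructed elsewhere in this diff):

```python
import tensorflow as tf

cfg = StepwiseLrConfig(boundaries=[100000, 110000], values=[1.0, 0.5, 0.1])
schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=cfg.boundaries, values=cfg.values)
print(schedule(50000).numpy())   # 1.0: first interval
print(schedule(105000).numpy())  # 0.5: second interval
print(schedule(200000).numpy())  # 0.1: after the last boundary
```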
-@dataclasses.dataclass -class PolynomialLrConfig(base_config.Config): - """Configuration for polynomial learning rate decay. - - This class is a container for the polynomial learning rate decay configs. - - Attributes: - name: The name of the learning rate schedule. Defaults to PolynomialDecay. - initial_learning_rate: A float. The initial learning rate. Defaults to None. - decay_steps: A positive integer that is used for decay computation. Defaults - to None. - end_learning_rate: A float. The minimal end learning rate. - power: A float. The power of the polynomial. Defaults to linear, 1.0. - cycle: A boolean, whether or not it should cycle beyond decay_steps. - Defaults to False. - """ - name: str = 'PolynomialDecay' - initial_learning_rate: Optional[float] = None - decay_steps: Optional[int] = None - end_learning_rate: float = 0.0001 - power: float = 1.0 - cycle: bool = False - - -@dataclasses.dataclass -class CosineLrConfig(base_config.Config): - """Configuration for Cosine learning rate decay. - - This class is a container for the cosine learning rate decay configs, - tf.keras.experimental.CosineDecay. - - Attributes: - name: The name of the learning rate schedule. Defaults to CosineDecay. - initial_learning_rate: A float. The initial learning rate. Defaults to None. - decay_steps: A positive integer that is used for decay computation. Defaults - to None. - alpha: A float. Minimum learning rate value as a fraction of - initial_learning_rate. - """ - name: str = 'CosineDecay' - initial_learning_rate: Optional[float] = None - decay_steps: Optional[int] = None - alpha: float = 0.0 - - -@dataclasses.dataclass -class DirectPowerLrConfig(base_config.Config): - """Configuration for DirectPower learning rate decay. - - This class configures a schedule that follows lr * (step)^power. - - Attributes: - name: The name of the learning rate schedule. Defaults to DirectPowerDecay. - initial_learning_rate: A float. The initial learning rate. Defaults to None. - power: A float. Defaults to -0.5, for sqrt decay. - """ - name: str = 'DirectPowerDecay' - initial_learning_rate: Optional[float] = None - power: float = -0.5 - - -@dataclasses.dataclass -class PowerAndLinearDecayLrConfig(base_config.Config): - """Configuration for power-and-linear learning rate decay. - - The schedule has the following behavior. - Let offset_step = step - offset. - 1) If offset_step < 0, the actual learning rate equals initial_learning_rate. - 2) If offset_step <= total_decay_steps * (1 - linear_decay_fraction), the - actual learning rate equals lr * offset_step^power. - 3) If total_decay_steps * (1 - linear_decay_fraction) <= offset_step < - total_decay_steps, the actual learning rate equals lr * offset_step^power * - (total_decay_steps - offset_step) / (total_decay_steps * - linear_decay_fraction). - 4) If offset_step >= total_decay_steps, the actual learning rate equals zero. - - Attributes: - name: The name of the learning rate schedule. Defaults to - PowerAndLinearDecay. - initial_learning_rate: A float. The initial learning rate. Defaults to None. - total_decay_steps: An int. The total number of steps for power + linear - decay. Defaults to None. - power: A float. The order of the polynomial. Defaults to -0.5, for sqrt - decay. - linear_decay_fraction: A float. In the last `linear_decay_fraction * - total_decay_steps` steps, the learning rate is additionally multiplied by - a linear decay. Defaults to 0.1. - offset: An int. The offset applied to steps. Defaults to 0. - """ - name: str = 'PowerAndLinearDecay' - initial_learning_rate: Optional[float] = None - total_decay_steps: Optional[int] = None - power: float = -0.5 - linear_decay_fraction: float = 0.1 - offset: int = 0 - -
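Restating the four cases above in plain Python makes the shape of the curve easier to check; this is a sketch for intuition only, not the library implementation (which builds a tf.keras LearningRateSchedule):

```python
def power_and_linear_lr(step, lr=1.0, total_decay_steps=10000,
                        power=-0.5, linear_decay_fraction=0.1, offset=0):
  offset_step = step - offset
  if offset_step < 0:                    # case 1: before the offset
    return lr
  if offset_step >= total_decay_steps:   # case 4: fully decayed
    return 0.0
  value = lr * offset_step**power        # cases 2 and 3 share this factor
  linear_start = total_decay_steps * (1 - linear_decay_fraction)
  if offset_step > linear_start:         # case 3: extra linear ramp to zero
    value *= (total_decay_steps - offset_step) / (
        total_decay_steps * linear_decay_fraction)
  return value

# e.g. sqrt decay over 10k steps (step 0 is degenerate for power < 0,
# which is why this schedule is normally paired with a warmup):
print(power_and_linear_lr(100))   # 0.1
print(power_and_linear_lr(9500))  # sqrt decay times a 0.5 linear factor
```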
- """ - name: str = 'PowerAndLinearDecay' - initial_learning_rate: Optional[float] = None - total_decay_steps: Optional[int] = None - power: float = -0.5 - linear_decay_fraction: float = 0.1 - offset: int = 0 - - -@dataclasses.dataclass -class PowerDecayWithOffsetLrConfig(base_config.Config): - """Configuration for power learning rate decay with step offset. - - Learning rate equals to `pre_offset_learning_rate` if `step` < `offset`. - Otherwise, learning rate equals to lr * (step - offset)^power. - - Attributes: - name: The name of the learning rate schedule. Defaults to - PowerDecayWithOffset. - initial_learning_rate: A float. The initial learning rate. Defaults to None. - power: A float. Defaults to -0.5, for sqrt decay. - offset: An integer. Power decay happens after `offset` steps. - pre_offset_learning_rate: A float. The constant learning rate before - `offset` steps. - """ - name: str = 'PowerDecayWithOffset' - initial_learning_rate: Optional[float] = None - power: float = -0.5 - offset: int = 0 - pre_offset_learning_rate: float = 1.0e6 - - -@dataclasses.dataclass -class LinearWarmupConfig(base_config.Config): - """Configuration for linear warmup schedule config. - - This class is a container for the linear warmup schedule configs. - Warmup_learning_rate is the initial learning rate, the final learning rate of - the warmup period is the learning_rate of the optimizer in use. The learning - rate at each step linearly increased according to the following formula: - warmup_learning_rate = warmup_learning_rate + - step / warmup_steps * (final_learning_rate - warmup_learning_rate). - Using warmup overrides the learning rate schedule by the number of warmup - steps. - - Attributes: - name: The name of warmup schedule. Defaults to linear. - warmup_learning_rate: Initial learning rate for the warmup. Defaults to 0. - warmup_steps: Warmup steps. Defaults to None. - """ - name: str = 'linear' - warmup_learning_rate: float = 0 - warmup_steps: Optional[int] = None - - -@dataclasses.dataclass -class PolynomialWarmupConfig(base_config.Config): - """Configuration for linear warmup schedule config. - - This class is a container for the polynomial warmup schedule configs. - - Attributes: - name: The name of warmup schedule. Defaults to Polynomial. - power: Polynomial power. Defaults to 1. - warmup_steps: Warmup steps. Defaults to None. - """ - name: str = 'polynomial' - power: float = 1 - warmup_steps: Optional[int] = None diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/configs/optimization_config.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/configs/optimization_config.py deleted file mode 100644 index 61ec8d7b4254bbe515f68ae48991c820d7e14816..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/configs/optimization_config.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/configs/optimization_config.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/configs/optimization_config.py deleted file mode 100644 index 61ec8d7b4254bbe515f68ae48991c820d7e14816..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/configs/optimization_config.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Dataclasses for optimization configs. - -This file defines the dataclasses for optimization configs (OptimizationConfig). -It also has two helper functions, get_optimizer_config and get_lr_config, that -retrieve the selected configs from an OptimizationConfig instance. -""" -from typing import Optional - -import dataclasses - -from official.modeling.hyperparams import base_config -from official.modeling.hyperparams import oneof -from official.modeling.optimization.configs import learning_rate_config as lr_cfg -from official.modeling.optimization.configs import optimizer_config as opt_cfg - - -@dataclasses.dataclass -class OptimizerConfig(oneof.OneOfConfig): - """Configuration for optimizer. - - Attributes: - type: 'str', type of optimizer to be used, one of the fields below. - sgd: sgd optimizer config. - adam: adam optimizer config. - adamw: adam with weight decay. - lamb: lamb optimizer. - rmsprop: rmsprop optimizer. - lars: lars optimizer. - adagrad: adagrad optimizer. - """ - type: Optional[str] = None - sgd: opt_cfg.SGDConfig = opt_cfg.SGDConfig() - adam: opt_cfg.AdamConfig = opt_cfg.AdamConfig() - adamw: opt_cfg.AdamWeightDecayConfig = opt_cfg.AdamWeightDecayConfig() - lamb: opt_cfg.LAMBConfig = opt_cfg.LAMBConfig() - rmsprop: opt_cfg.RMSPropConfig = opt_cfg.RMSPropConfig() - lars: opt_cfg.LARSConfig = opt_cfg.LARSConfig() - adagrad: opt_cfg.AdagradConfig = opt_cfg.AdagradConfig() - - -@dataclasses.dataclass -class LrConfig(oneof.OneOfConfig): - """Configuration for lr schedule. - - Attributes: - type: 'str', type of lr schedule to be used, one of the fields below. - constant: constant learning rate config. - stepwise: stepwise learning rate config. - exponential: exponential learning rate config. - polynomial: polynomial learning rate config. - cosine: cosine learning rate config. - power: step^power learning rate config. - power_linear: learning rate config of step^power followed by - step^power*linear. - power_with_offset: power decay with a step offset. - """ - type: Optional[str] = None - constant: lr_cfg.ConstantLrConfig = lr_cfg.ConstantLrConfig() - stepwise: lr_cfg.StepwiseLrConfig = lr_cfg.StepwiseLrConfig() - exponential: lr_cfg.ExponentialLrConfig = lr_cfg.ExponentialLrConfig() - polynomial: lr_cfg.PolynomialLrConfig = lr_cfg.PolynomialLrConfig() - cosine: lr_cfg.CosineLrConfig = lr_cfg.CosineLrConfig() - power: lr_cfg.DirectPowerLrConfig = lr_cfg.DirectPowerLrConfig() - power_linear: lr_cfg.PowerAndLinearDecayLrConfig = ( - lr_cfg.PowerAndLinearDecayLrConfig()) - power_with_offset: lr_cfg.PowerDecayWithOffsetLrConfig = ( - lr_cfg.PowerDecayWithOffsetLrConfig()) - - -@dataclasses.dataclass -class WarmupConfig(oneof.OneOfConfig): - """Configuration for warmup schedule.
- - Attributes: - type: 'str', type of warmup schedule to be used, one of the fields below. - linear: linear warmup config. - polynomial: polynomial warmup config. - """ - type: Optional[str] = None - linear: lr_cfg.LinearWarmupConfig = lr_cfg.LinearWarmupConfig() - polynomial: lr_cfg.PolynomialWarmupConfig = lr_cfg.PolynomialWarmupConfig() - - -@dataclasses.dataclass -class OptimizationConfig(base_config.Config): - """Configuration for optimizer and learning rate schedule. - - Attributes: - optimizer: optimizer oneof config. - ema: optional exponential moving average optimizer config, if specified, ema - optimizer will be used. - learning_rate: learning rate oneof config. - warmup: warmup oneof config. - """ - optimizer: OptimizerConfig = OptimizerConfig() - ema: Optional[opt_cfg.EMAConfig] = None - learning_rate: LrConfig = LrConfig() - warmup: WarmupConfig = WarmupConfig() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/configs/optimization_config_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/configs/optimization_config_test.py deleted file mode 100644 index a4e31078d1a56eed617129b3e6cfbd6a43df105b..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/configs/optimization_config_test.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""Tests for optimization_config.py.""" - -import tensorflow as tf - -from official.modeling.optimization.configs import learning_rate_config as lr_cfg -from official.modeling.optimization.configs import optimization_config -from official.modeling.optimization.configs import optimizer_config as opt_cfg - - -class OptimizerConfigTest(tf.test.TestCase): - - def test_no_optimizer(self): - optimizer = optimization_config.OptimizationConfig({}).optimizer.get() - self.assertIsNone(optimizer) - - def test_no_lr_schedule(self): - lr = optimization_config.OptimizationConfig({}).learning_rate.get() - self.assertIsNone(lr) - - def test_no_warmup_schedule(self): - warmup = optimization_config.OptimizationConfig({}).warmup.get() - self.assertIsNone(warmup) - - def test_config(self): - opt_config = optimization_config.OptimizationConfig({ - 'optimizer': { - 'type': 'sgd', - 'sgd': {} # default config - }, - 'learning_rate': { - 'type': 'polynomial', - 'polynomial': {} - }, - 'warmup': { - 'type': 'linear' - } - }) - self.assertEqual(opt_config.optimizer.get(), opt_cfg.SGDConfig()) - self.assertEqual(opt_config.learning_rate.get(), - lr_cfg.PolynomialLrConfig()) - self.assertEqual(opt_config.warmup.get(), lr_cfg.LinearWarmupConfig()) - - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/configs/optimizer_config.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/configs/optimizer_config.py deleted file mode 100644 index a14c1f2075b41a9623bec68d07323e0042c567c3..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/configs/optimizer_config.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Dataclasses for optimizer configs.""" -from typing import List, Optional - -import dataclasses -from official.modeling.hyperparams import base_config - - -@dataclasses.dataclass -class BaseOptimizerConfig(base_config.Config): - """Base optimizer config. - - Attributes: - clipnorm: float >= 0 or None. If not None, Gradients will be clipped when - their L2 norm exceeds this value. - clipvalue: float >= 0 or None. 
If not None, gradients will be clipped when - their absolute value exceeds this value. - global_clipnorm: float >= 0 or None. If not None, the gradients of all weights - are clipped so that their global norm is no higher than this value. - """ - clipnorm: Optional[float] = None - clipvalue: Optional[float] = None - global_clipnorm: Optional[float] = None - - -@dataclasses.dataclass -class SGDConfig(BaseOptimizerConfig): - """Configuration for SGD optimizer. - - The attributes for this class match the arguments of tf.keras.optimizers.SGD. - - Attributes: - name: name of the optimizer. - decay: decay rate for SGD optimizer. - nesterov: nesterov for SGD optimizer. - momentum: momentum for SGD optimizer. - """ - name: str = "SGD" - decay: float = 0.0 - nesterov: bool = False - momentum: float = 0.0 - - -@dataclasses.dataclass -class RMSPropConfig(BaseOptimizerConfig): - """Configuration for RMSProp optimizer. - - The attributes for this class match the arguments of - tf.keras.optimizers.RMSprop. - - Attributes: - name: name of the optimizer. - rho: discounting factor for RMSprop optimizer. - momentum: momentum for RMSprop optimizer. - epsilon: epsilon value for RMSprop optimizer, helps with numerical stability. - centered: Whether to normalize gradients or not. - """ - name: str = "RMSprop" - rho: float = 0.9 - momentum: float = 0.0 - epsilon: float = 1e-7 - centered: bool = False - - -@dataclasses.dataclass -class AdagradConfig(BaseOptimizerConfig): - """Configuration for Adagrad optimizer. - - The attributes of this class match the arguments of - tf.keras.optimizers.Adagrad. - - Attributes: - name: name of the optimizer. - initial_accumulator_value: A floating point value. Starting value for the - accumulators, must be non-negative. - epsilon: A small floating point value to avoid zero denominator. - """ - name: str = "Adagrad" - initial_accumulator_value: float = 0.1 - epsilon: float = 1e-07 - - -@dataclasses.dataclass -class AdamConfig(BaseOptimizerConfig): - """Configuration for Adam optimizer. - - The attributes for this class match the arguments of - tf.keras.optimizers.Adam. - - Attributes: - name: name of the optimizer. - beta_1: decay rate for 1st order moments. - beta_2: decay rate for 2nd order moments. - epsilon: epsilon value used for numerical stability in Adam optimizer. - amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm - from the paper "On the Convergence of Adam and Beyond". - """ - name: str = "Adam" - beta_1: float = 0.9 - beta_2: float = 0.999 - epsilon: float = 1e-07 - amsgrad: bool = False - - -@dataclasses.dataclass -class AdamWeightDecayConfig(BaseOptimizerConfig): - """Configuration for Adam optimizer with weight decay. - - Attributes: - name: name of the optimizer. - beta_1: decay rate for 1st order moments. - beta_2: decay rate for 2nd order moments. - epsilon: epsilon value used for numerical stability in the optimizer. - amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm - from the paper "On the Convergence of Adam and Beyond". - weight_decay_rate: float. Weight decay rate. Defaults to 0. - include_in_weight_decay: list[str], or None. List of weight names to include - in weight decay. - exclude_from_weight_decay: list[str], or None. List of weight names to not - include in weight decay. - gradient_clip_norm: A positive float. Clips the gradients to this maximum - L2-norm. Defaults to 1.0. - """ - name: str = "AdamWeightDecay" - beta_1: float = 0.9 - beta_2: float = 0.999 - epsilon: float = 1e-07 - amsgrad: bool = False - weight_decay_rate: float = 0.0 - include_in_weight_decay: Optional[List[str]] = None - exclude_from_weight_decay: Optional[List[str]] = None - gradient_clip_norm: float = 1.0 - -
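A by-hand translation of one of these dataclasses into its Keras optimizer, to show what the factory (not part of this diff) ultimately does; note the learning rate is supplied by the LR schedule, not by the optimizer config:

```python
import tensorflow as tf

cfg = AdamConfig(beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False)
optimizer = tf.keras.optimizers.Adam(
    learning_rate=1e-4,  # comes from the schedule, see learning_rate_config.py
    beta_1=cfg.beta_1,
    beta_2=cfg.beta_2,
    epsilon=cfg.epsilon,
    amsgrad=cfg.amsgrad)
```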
- """ - name: str = "AdamWeightDecay" - beta_1: float = 0.9 - beta_2: float = 0.999 - epsilon: float = 1e-07 - amsgrad: bool = False - weight_decay_rate: float = 0.0 - include_in_weight_decay: Optional[List[str]] = None - exclude_from_weight_decay: Optional[List[str]] = None - gradient_clip_norm: float = 1.0 - - -@dataclasses.dataclass -class LAMBConfig(BaseOptimizerConfig): - """Configuration for LAMB optimizer. - - The attributes for this class matches the arguments of - tensorflow_addons.optimizers.LAMB. - - Attributes: - name: name of the optimizer. - beta_1: decay rate for 1st order moments. - beta_2: decay rate for 2st order moments. - epsilon: epsilon value used for numerical stability in LAMB optimizer. - weight_decay_rate: float. Weight decay rate. Default to 0. - exclude_from_weight_decay: List of regex patterns of variables excluded from - weight decay. Variables whose name contain a substring matching the - pattern will be excluded. - exclude_from_layer_adaptation: List of regex patterns of variables excluded - from layer adaptation. Variables whose name contain a substring matching - the pattern will be excluded. - """ - name: str = "LAMB" - beta_1: float = 0.9 - beta_2: float = 0.999 - epsilon: float = 1e-6 - weight_decay_rate: float = 0.0 - exclude_from_weight_decay: Optional[List[str]] = None - exclude_from_layer_adaptation: Optional[List[str]] = None - - -@dataclasses.dataclass -class EMAConfig(BaseOptimizerConfig): - """Exponential moving average optimizer config. - - Attributes: - name: 'str', name of the optimizer. - average_decay: 'float', average decay value. - start_step: 'int', start step to apply moving average. - dynamic_decay: 'bool', whether to apply dynamic decay or not. - """ - name: str = "ExponentialMovingAverage" - average_decay: float = 0.99 - start_step: int = 0 - dynamic_decay: bool = True - - -@dataclasses.dataclass -class LARSConfig(BaseOptimizerConfig): - """Layer-wise adaptive rate scaling config. - - Attributes: - name: 'str', name of the optimizer. - momentum: `float` hyperparameter >= 0 that accelerates gradient descent in - the relevant direction and dampens oscillations. Defaults to 0.9. - eeta: `float` LARS coefficient as used in the paper. Default set to LARS - coefficient from the paper. (eeta / weight_decay) determines the highest - scaling factor in LARS.. - weight_decay_rate: `float` for weight decay. - nesterov: 'boolean' for whether to use nesterov momentum. - classic_momentum: `boolean` for whether to use classic (or popular) - momentum. The learning rate is applied during momentum update in classic - momentum, but after momentum for popular momentum. - exclude_from_weight_decay: A list of `string` for variable screening, if any - of the string appears in a variable's name, the variable will be excluded - for computing weight decay. For example, one could specify the list like - ['batch_normalization', 'bias'] to exclude BN and bias from weight decay. - exclude_from_layer_adaptation: Similar to exclude_from_weight_decay, but for - layer adaptation. If it is None, it will be defaulted the same as - exclude_from_weight_decay. 
- """ - name: str = "LARS" - momentum: float = 0.9 - eeta: float = 0.001 - weight_decay_rate: float = 0.0 - nesterov: bool = False - classic_momentum: bool = True - exclude_from_weight_decay: Optional[List[str]] = None - exclude_from_layer_adaptation: Optional[List[str]] = None diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/ema_optimizer.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/ema_optimizer.py deleted file mode 100644 index d4eab40d4d1d5e81989a50e605df02b2d643f44c..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/ema_optimizer.py +++ /dev/null @@ -1,261 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Exponential moving average optimizer.""" - -from typing import Text, List - -import tensorflow as tf - -# pylint: disable=protected-access - - -class ExponentialMovingAverage(tf.keras.optimizers.Optimizer): - """Optimizer that computes an exponential moving average of the variables. - - Empirically it has been found that using the moving average of the trained - parameters of a deep network is better than using its trained parameters - directly. This optimizer allows you to compute this moving average and swap - the variables at save time so that any code outside of the training loop - will use by default the average values instead of the original ones. - - Example of usage for training: - ```python - opt = tf.keras.optimizers.SGD(learning_rate) - opt = ExponentialMovingAverage(opt) - - opt.shadow_copy(model) - ``` - - At test time, swap the shadow variables to evaluate on the averaged weights: - ```python - opt.swap_weights() - # Test eval the model here - opt.swap_weights() - ``` - """ - - def __init__(self, - optimizer: tf.keras.optimizers.Optimizer, - average_decay: float = 0.99, - start_step: int = 0, - dynamic_decay: bool = True, - name: Text = 'ExponentialMovingAverage', - **kwargs): - """Construct a new ExponentialMovingAverage optimizer. - - Args: - optimizer: `tf.keras.optimizers.Optimizer` that will be - used to compute and apply gradients. - average_decay: float. Decay to use to maintain the moving averages - of trained variables. 
- start_step: int. What step to start the moving average. - dynamic_decay: bool. Whether to change the decay based on the number - of optimizer updates. Decay will start at 0.1 and gradually increase - up to `average_decay` after each optimizer update. This behavior is - similar to `tf.train.ExponentialMovingAverage` in TF 1.x. - name: Optional name for the operations created when applying - gradients. Defaults to "moving_average". - **kwargs: keyword arguments. Allowed to be {`clipnorm`, - `clipvalue`, `lr`, `decay`}. - """ - super().__init__(name, **kwargs) - self._average_decay = average_decay - self._start_step = tf.constant(start_step, tf.float32) - self._dynamic_decay = dynamic_decay - self._optimizer = optimizer - self._track_trackable(self._optimizer, 'base_optimizer') - self._average_weights = None - self._model_weights = None - - def shadow_copy(self, model: tf.keras.Model): - """Creates shadow variables for the given model weights.""" - for var in model.weights: - self.add_slot(var, 'average', initializer='zeros') - self._average_weights = [ - self.get_slot(var, 'average') for var in model.weights - ] - self._model_weights = model.weights - - @property - def has_shadow_copy(self): - """Whether this optimizer has created shadow variables.""" - return self._model_weights is not None and self._average_weights is not None - - def _create_slots(self, var_list): - self._optimizer._create_slots(var_list=var_list) # pylint: disable=protected-access - - def apply_gradients(self, grads_and_vars, name: Text = None): - result = self._optimizer.apply_gradients(grads_and_vars, name) - self.update_average(self.iterations) - return result - - @tf.function - def update_average(self, step: tf.Tensor): - step = tf.cast(step, tf.float32) - if step < self._start_step: - decay = tf.constant(0., tf.float32) - elif self._dynamic_decay: - decay = step - self._start_step - decay = tf.minimum(self._average_decay, (1. + decay) / (10. + decay)) - else: - decay = self._average_decay - - def _apply_moving(v_moving, v_normal): - diff = v_moving - v_normal - v_moving.assign_sub(tf.cast(1. - decay, v_moving.dtype) * diff) - return v_moving - - def _update(strategy, v_moving_and_v_normal): - for v_moving, v_normal in v_moving_and_v_normal: - strategy.extended.update(v_moving, _apply_moving, args=(v_normal,)) - - ctx = tf.distribute.get_replica_context() - return ctx.merge_call(_update, args=(zip(self._average_weights, - self._model_weights),)) - - def swap_weights(self): - """Swap the average and moving weights. - - This is a convenience method to allow one to evaluate the averaged weights - at test time. Loads the weights stored in `self._average` into the model, - keeping a copy of the original model weights. Swapping twice will return - the original weights. 
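The dynamic decay computed in `update_average` above ramps from 0.1 toward `average_decay` as training progresses; a plain-Python restatement for intuition (a sketch, not the library code):

```python
def ema_decay(step, start_step=0, average_decay=0.99, dynamic_decay=True):
  t = step - start_step
  if t < 0:
    return 0.0               # averaging has not started yet
  if not dynamic_decay:
    return average_decay
  return min(average_decay, (1.0 + t) / (10.0 + t))

print(ema_decay(0))      # 0.1: early steps track the model weights closely
print(ema_decay(90))     # 0.91: the average gets stickier over time
print(ema_decay(10000))  # 0.99: capped at average_decay
```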
- """ - if tf.distribute.in_cross_replica_context(): - strategy = tf.distribute.get_strategy() - strategy.run(self._swap_weights, args=()) - else: - raise ValueError('Swapping weights must occur under a ' - 'tf.distribute.Strategy') - - @tf.function - def _swap_weights(self): - def fn_0(a, b): - a.assign_add(b) - return a - def fn_1(b, a): - b.assign(a - b) - return b - def fn_2(a, b): - a.assign_sub(b) - return a - - def swap(strategy, a_and_b): - """Swap `a` and `b` and mirror to all devices.""" - for a, b in a_and_b: - strategy.extended.update(a, fn_0, args=(b,)) # a = a + b - strategy.extended.update(b, fn_1, args=(a,)) # b = a - b - strategy.extended.update(a, fn_2, args=(b,)) # a = a - b - - ctx = tf.distribute.get_replica_context() - return ctx.merge_call( - swap, args=(zip(self._average_weights, self._model_weights),)) - - def assign_average_vars(self, var_list: List[tf.Variable]): - """Assign variables in var_list with their respective averages. - - Args: - var_list: List of model variables to be assigned to their average. - Returns: - assign_op: The op corresponding to the assignment operation of - variables to their average. - """ - assign_op = tf.group([ - var.assign(self.get_slot(var, 'average')) for var in var_list - if var.trainable - ]) - return assign_op - - def _create_hypers(self): - self._optimizer._create_hypers() # pylint: disable=protected-access - - def _prepare(self, var_list): - return self._optimizer._prepare(var_list=var_list) # pylint: disable=protected-access - - @property - def iterations(self): - return self._optimizer.iterations - - @iterations.setter - def iterations(self, variable): - self._optimizer.iterations = variable - - @property - def weights(self): - # return self._weights + self._optimizer.weights - return self._optimizer.weights - - def variables(self): - return self._weights + [self.iterations] - - @property - def lr(self): - return self._optimizer._get_hyper('learning_rate') - - @lr.setter - def lr(self, lr): - self._optimizer._set_hyper('learning_rate', lr) - - @property - def learning_rate(self): - return self._optimizer._get_hyper('learning_rate') - - @learning_rate.setter - def learning_rate(self, learning_rate): # pylint: disable=redefined-outer-name - self._optimizer._set_hyper('learning_rate', learning_rate) - - def _resource_apply_dense(self, grad, var): - return self._optimizer._resource_apply_dense(grad, var) - - def _resource_apply_sparse(self, grad, var, indices): - return self._optimizer._resource_apply_sparse(grad, var, indices) - - def _resource_apply_sparse_duplicate_indices(self, grad, var, indices): - return self._optimizer._resource_apply_sparse_duplicate_indices( - grad, var, indices) - - def get_config(self): - config = { - 'optimizer': tf.keras.optimizers.serialize(self._optimizer), - 'average_decay': self._average_decay, - 'start_step': self._start_step, - 'dynamic_decay': self._dynamic_decay, - } - base_config = super(ExponentialMovingAverage, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config, custom_objects=None): - optimizer = tf.keras.optimizers.deserialize( - config.pop('optimizer'), - custom_objects=custom_objects, - ) - return cls(optimizer, **config) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/lars_optimizer.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/lars_optimizer.py deleted file mode 100644 index 
a2e88ba1900b2af43d3802a3586bec4219213e4e..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/lars_optimizer.py +++ /dev/null @@ -1,202 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Layer-wise adaptive rate scaling optimizer.""" -import re -from typing import Text, List, Optional - -import tensorflow as tf - - -# pylint: disable=protected-access - - -class LARS(tf.keras.optimizers.Optimizer): - """Layer-wise Adaptive Rate Scaling for large batch training. - - Introduced by "Large Batch Training of Convolutional Networks" by Y. You, - I. Gitman, and B. Ginsburg. (https://arxiv.org/abs/1708.03888) - """ - - def __init__(self, - learning_rate: float = 0.01, - momentum: float = 0.9, - weight_decay_rate: float = 0.0, - eeta: float = 0.001, - nesterov: bool = False, - classic_momentum: bool = True, - exclude_from_weight_decay: Optional[List[Text]] = None, - exclude_from_layer_adaptation: Optional[List[Text]] = None, - name: Text = "LARS", - **kwargs): - """Constructs a LARSOptimizer. - - Args: - learning_rate: `float` for learning rate. Defaults to 0.01. - momentum: `float` hyperparameter >= 0 that accelerates gradient descent - in the relevant direction and dampens oscillations. Defaults to 0.9. - weight_decay_rate: `float` for weight decay. - eeta: `float` LARS coefficient as used in the paper. Default set to LARS - coefficient from the paper. (eeta / weight_decay) determines the - highest scaling factor in LARS.. - nesterov: 'boolean' for whether to use nesterov momentum. - classic_momentum: `boolean` for whether to use classic (or popular) - momentum. The learning rate is applied during momentum update in - classic momentum, but after momentum for popular momentum. - exclude_from_weight_decay: A list of `string` for variable screening, if - any of the string appears in a variable's name, the variable will be - excluded for computing weight decay. For example, one could specify - the list like ['batch_normalization', 'bias'] to exclude BN and bias - from weight decay. - exclude_from_layer_adaptation: Similar to exclude_from_weight_decay, but - for layer adaptation. If it is None, it will be defaulted the same as - exclude_from_weight_decay. 
- name: `Text` as optional name for the operations created when applying - gradients. Defaults to "LARS". - **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, - `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip - gradients by value, `decay` is included for backward compatibility to - allow time inverse decay of learning rate. `lr` is included for - backward compatibility, recommended to use `learning_rate` instead. - """ - super(LARS, self).__init__(name, **kwargs) - - self._set_hyper("learning_rate", learning_rate) - self._set_hyper("decay", self._initial_decay) - self.momentum = momentum - self.weight_decay_rate = weight_decay_rate - self.eeta = eeta - self.nesterov = nesterov - self.classic_momentum = classic_momentum - self.exclude_from_weight_decay = exclude_from_weight_decay - # exclude_from_layer_adaptation is set to exclude_from_weight_decay if the - # arg is None. - if exclude_from_layer_adaptation: - self.exclude_from_layer_adaptation = exclude_from_layer_adaptation - else: - self.exclude_from_layer_adaptation = exclude_from_weight_decay - - def _create_slots(self, var_list): - for v in var_list: - self.add_slot(v, "momentum") - - def _resource_apply_dense(self, grad, param, apply_state=None): - if grad is None or param is None: - return tf.no_op() - - var_device, var_dtype = param.device, param.dtype.base_dtype - coefficients = ((apply_state or {}).get((var_device, var_dtype)) or - self._fallback_apply_state(var_device, var_dtype)) - learning_rate = coefficients["lr_t"] - - param_name = param.name - - v = self.get_slot(param, "momentum") - - if self._use_weight_decay(param_name): - grad += self.weight_decay_rate * param - - if self.classic_momentum: - trust_ratio = 1.0 - if self._do_layer_adaptation(param_name): - w_norm = tf.norm(param, ord=2) - g_norm = tf.norm(grad, ord=2) - trust_ratio = tf.where( - tf.greater(w_norm, 0), - tf.where(tf.greater(g_norm, 0), (self.eeta * w_norm / g_norm), 1.0), - 1.0) - scaled_lr = learning_rate * trust_ratio - - next_v = tf.multiply(self.momentum, v) + scaled_lr * grad - if self.nesterov: - update = tf.multiply(self.momentum, next_v) + scaled_lr * grad - else: - update = next_v - next_param = param - update - else: - next_v = tf.multiply(self.momentum, v) + grad - if self.nesterov: - update = tf.multiply(self.momentum, next_v) + grad - else: - update = next_v - - trust_ratio = 1.0 - if self._do_layer_adaptation(param_name): - w_norm = tf.norm(param, ord=2) - v_norm = tf.norm(update, ord=2) - trust_ratio = tf.where( - tf.greater(w_norm, 0), - tf.where(tf.greater(v_norm, 0), (self.eeta * w_norm / v_norm), 1.0), - 1.0) - scaled_lr = trust_ratio * learning_rate - next_param = param - scaled_lr * update - - return tf.group(*[ - param.assign(next_param, use_locking=False), - v.assign(next_v, use_locking=False) - ]) - - def _resource_apply_sparse(self, grad, handle, indices, apply_state): - raise NotImplementedError("Applying sparse gradients is not implemented.") - - def _use_weight_decay(self, param_name): - """Whether to use L2 weight decay for `param_name`.""" - if not self.weight_decay_rate: - return False - if self.exclude_from_weight_decay: - for r in self.exclude_from_weight_decay: - if re.search(r, param_name) is not None: - return False - return True - - def _do_layer_adaptation(self, param_name): - """Whether to do layer-wise learning rate adaptation for `param_name`.""" - if self.exclude_from_layer_adaptation: - for r in self.exclude_from_layer_adaptation: - if re.search(r, param_name) is not 
None: - return False - return True - - def get_config(self): - config = super(LARS, self).get_config() - config.update({ - "learning_rate": self._serialize_hyperparameter("learning_rate"), - "decay": self._serialize_hyperparameter("decay"), - "momentum": self.momentum, - "classic_momentum": self.classic_momentum, - "weight_decay_rate": self.weight_decay_rate, - "eeta": self.eeta, - "nesterov": self.nesterov, - }) - return config - - @classmethod - def from_config(cls, config, custom_objects=None): - return cls(**config) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/lr_schedule.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/lr_schedule.py deleted file mode 100644 index ff1f6b189a9a2d7b3ce2df950451764eaacf397d..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/lr_schedule.py +++ /dev/null @@ -1,332 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Learning rate schedule classes.""" - -from typing import Mapping, Any, Union, Optional - -import tensorflow as tf - - -class LinearWarmup(tf.keras.optimizers.schedules.LearningRateSchedule): - """Linear warmup schedule.""" - - def __init__(self, - after_warmup_lr_sched: Union[ - tf.keras.optimizers.schedules.LearningRateSchedule, float], - warmup_steps: int, - warmup_learning_rate: float, - name: Optional[str] = None): - """Adds linear warmup to a learning rate schedule. - - warmup_lr is the initial learning rate; the final learning rate of the - warmup period is the initial learning rate of the schedule in use. - The learning rate at each step increases linearly according to the - following formula: - learning_rate = warmup_lr + step / warmup_steps - * (final_warmup_lr - warmup_lr). - Using warmup overrides the underlying learning rate schedule for the - first warmup_steps steps. - - Args: - after_warmup_lr_sched: tf.keras.optimizers.schedules.LearningRateSchedule - or a constant. - warmup_steps: Number of warmup steps. - warmup_learning_rate: Initial learning rate for the warmup. - name: Optional, name of warmup schedule.
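As a quick sanity check of the warmup formula above, a sketch in plain Python using the same values the optimizer-factory tests further below exercise (warmup rate 0.01, constant after-warmup rate 0.1, 500 warmup steps):

    # Linear warmup interpolation at step 250 (halfway through warmup).
    warmup_lr, final_warmup_lr, warmup_steps, step = 0.01, 0.1, 500.0, 250.0
    lr = warmup_lr + step / warmup_steps * (final_warmup_lr - warmup_lr)
    assert abs(lr - 0.055) < 1e-9  # matches the expected value at step 250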
- """ - super().__init__() - self._name = name - self._after_warmup_lr_sched = after_warmup_lr_sched - self._warmup_steps = warmup_steps - self._init_warmup_lr = warmup_learning_rate - if isinstance(after_warmup_lr_sched, - tf.keras.optimizers.schedules.LearningRateSchedule): - self._final_warmup_lr = after_warmup_lr_sched(warmup_steps) - else: - self._final_warmup_lr = tf.cast(after_warmup_lr_sched, dtype=tf.float32) - - def __call__(self, step: int): - - global_step = tf.cast(step, dtype=tf.float32) - - linear_warmup_lr = ( - self._init_warmup_lr + global_step / self._warmup_steps * - (self._final_warmup_lr - self._init_warmup_lr)) - - if isinstance(self._after_warmup_lr_sched, - tf.keras.optimizers.schedules.LearningRateSchedule): - after_warmup_lr = self._after_warmup_lr_sched(step) - else: - after_warmup_lr = tf.cast(self._after_warmup_lr_sched, dtype=tf.float32) - - lr = tf.cond(global_step < self._warmup_steps, - lambda: linear_warmup_lr, - lambda: after_warmup_lr) - return lr - - def get_config(self) -> Mapping[str, Any]: - if isinstance(self._after_warmup_lr_sched, - tf.keras.optimizers.schedules.LearningRateSchedule): - config = { - "after_warmup_lr_sched": self._after_warmup_lr_sched.get_config()} # pytype: disable=attribute-error - else: - config = {"after_warmup_lr_sched": self._after_warmup_lr_sched} # pytype: disable=attribute-error - - config.update({ - "warmup_steps": self._warmup_steps, - "warmup_learning_rate": self._init_warmup_lr, - "name": self._name - }) - return config - - -class PolynomialWarmUp(tf.keras.optimizers.schedules.LearningRateSchedule): - """Applies polynomial warmup schedule on a given learning rate decay schedule.""" - - def __init__(self, - after_warmup_lr_sched: Union[ - tf.keras.optimizers.schedules.LearningRateSchedule, float], - warmup_steps: int, - power: float = 1.0, - name: str = "PolynomialWarmup"): - super().__init__() - if isinstance(after_warmup_lr_sched, - tf.keras.optimizers.schedules.LearningRateSchedule): - self._initial_learning_rate = after_warmup_lr_sched(warmup_steps) - else: - self._initial_learning_rate = tf.cast( - after_warmup_lr_sched, dtype=tf.float32) - - self._warmup_steps = warmup_steps - self._power = power - self._after_warmup_lr_sched = after_warmup_lr_sched - self._name = name - - def __call__(self, step): - with tf.name_scope(self._name or "PolynomialWarmUp") as name: - # Implements polynomial warmup. i.e., if global_step < warmup_steps, the - # learning rate will be `global_step/num_warmup_steps * init_lr`. - global_step_float = tf.cast(step, tf.float32) - warmup_steps_float = tf.cast(self._warmup_steps, tf.float32) - - if self._warmup_steps <= 0: - warmup_percent_done = 1.0 - else: - # A zero `step` may cause Inf. So make `step` positive. 
- step_non_zero = tf.math.maximum(global_step_float, 1.0) - warmup_percent_done = step_non_zero / warmup_steps_float - - warmup_learning_rate = ( - self._initial_learning_rate * - tf.math.pow(warmup_percent_done, self._power)) - - if isinstance(self._after_warmup_lr_sched, - tf.keras.optimizers.schedules.LearningRateSchedule): - after_warmup_lr = self._after_warmup_lr_sched(step) - else: - after_warmup_lr = tf.cast(self._after_warmup_lr_sched, dtype=tf.float32) - - return tf.cond( - global_step_float < warmup_steps_float, - lambda: warmup_learning_rate, - lambda: after_warmup_lr, - name=name) - - def get_config(self) -> Mapping[str, Any]: - if isinstance(self._after_warmup_lr_sched, - tf.keras.optimizers.schedules.LearningRateSchedule): - config = { - "after_warmup_lr_sched": self._after_warmup_lr_sched.get_config()} # pytype: disable=attribute-error - else: - config = {"after_warmup_lr_sched": self._after_warmup_lr_sched} # pytype: disable=attribute-error - - config.update({ - "warmup_steps": self._warmup_steps, - "power": self._power, - "name": self._name - }) - return config - - -class DirectPowerDecay(tf.keras.optimizers.schedules.LearningRateSchedule): - """Learning rate schedule follows lr * (step)^power.""" - - def __init__(self, - initial_learning_rate: float, - power: float = 1.0, - name: str = "DirectPowerDecay"): - """Initialize configuration of the learning rate schedule. - - Args: - initial_learning_rate: The initial learning rate. - power: The order of the polynomial. - name: Optional, name of learning rate schedule. - """ - super().__init__() - self._initial_learning_rate = initial_learning_rate - self._power = power - self._name = name - - def __call__(self, step): - with tf.name_scope(self._name or "DirectPowerDecay"): - step = tf.cast(step, tf.float32) - learning_rate = self._initial_learning_rate - # A zero `step` may cause Inf. So make `step` positive. - step_non_zero = tf.math.maximum(step, 1.0) - learning_rate *= tf.math.pow(step_non_zero, self._power) - return learning_rate - - def get_config(self): - """Get the configuration of the learning rate schedule.""" - return { - "initial_learning_rate": self._initial_learning_rate, - "power": self._power, - "name": self._name, - } - - -class PowerAndLinearDecay(tf.keras.optimizers.schedules.LearningRateSchedule): - """Learning rate schedule multiplied by a linear decay at the end. - - The schedule has the following behavior. - Let offset_step = step - offset. - 1) offset_step < 0, the actual learning rate equals initial_learning_rate. - 2) offset_step <= total_decay_steps * (1 - linear_decay_fraction), the - actual learning rate equals lr * offset_step^power. - 3) total_decay_steps * (1 - linear_decay_fraction) <= offset_step < - total_decay_steps, the actual learning rate equals lr * offset_step^power * - (total_decay_steps - offset_step) / (total_decay_steps * - linear_decay_fraction). - 4) offset_step >= total_decay_steps, the actual learning rate equals zero. - """ - - def __init__(self, - initial_learning_rate: float, - total_decay_steps: int, - power: float = 1.0, - linear_decay_fraction: float = 0.1, - offset: int = 0, - name: str = "PowerAndLinearDecay"): - """Initialize configuration of the learning rate schedule. - - Args: - initial_learning_rate: The initial learning rate. - total_decay_steps: The total number of steps for power + linear decay. - power: The order of the polynomial. - linear_decay_fraction: In the last fraction `linear_decay_fraction` of - the total decay steps, the learning rate will be multiplied by a - linear decay.
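A short usage sketch of DirectPowerDecay above, assuming the module is importable as official.modeling.optimization.lr_schedule; the values mirror the 'power' schedule case in the factory tests below:

    from official.modeling.optimization import lr_schedule

    sched = lr_schedule.DirectPowerDecay(initial_learning_rate=1.0, power=-1.0)
    # Step 0 is clamped to 1, so the schedule starts at the initial rate.
    assert abs(float(sched(0)) - 1.0) < 1e-6
    assert abs(float(sched(250)) - 1.0 / 250.0) < 1e-6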
- offset: The offset applied to steps. - name: Optional, name of learning rate schedule. - """ - super().__init__() - self._initial_learning_rate = initial_learning_rate - self._total_decay_steps = total_decay_steps - self._power = power - self._linear_decay_fraction = linear_decay_fraction - self._offset = offset - self._name = name - - def __call__(self, step): - with tf.name_scope(self._name or "PowerAndLinearDecay"): - step = tf.cast(step - self._offset, tf.float32) - learning_rate = self._initial_learning_rate - # A zero `step` may cause Inf. So make `step` positive. - step_non_zero = tf.math.maximum(step, 1.0) - learning_rate *= tf.math.pow(step_non_zero, self._power) - if self._total_decay_steps * self._linear_decay_fraction > 0: - learning_rate *= tf.minimum( - 1.0, (self._total_decay_steps - step) / - (self._total_decay_steps * self._linear_decay_fraction)) - learning_rate = tf.maximum(0.0, learning_rate) - return learning_rate - - def get_config(self): - """Get the configuration of the learning rate schedule.""" - return { - "initial_learning_rate": self._initial_learning_rate, - "total_decay_steps": self._total_decay_steps, - "power": self._power, - "linear_decay_fraction": self._linear_decay_fraction, - "offset": self._offset, - "name": self._name, - } - - -class PowerDecayWithOffset(tf.keras.optimizers.schedules.LearningRateSchedule): - """Power learning rate decay with offset. - - Learning rate equals to `pre_offset_learning_rate` if `step` < `offset`. - Otherwise, learning rate equals to lr * (step - offset)^power. - """ - - def __init__(self, - initial_learning_rate: float, - power: float = 1.0, - offset: int = 0, - pre_offset_learning_rate: float = 1.0e6, - name: str = "PowerDecayWithOffset"): - """Initialize configuration of the learning rate schedule. - - Args: - initial_learning_rate: The initial learning rate. - power: The order of the polynomial. - offset: The offset when computing the power decay. - pre_offset_learning_rate: The maximum learning rate we'll use. - name: Optional, name of learning rate schedule. - """ - super().__init__() - self._initial_learning_rate = initial_learning_rate - self._power = power - self._offset = offset - self._pre_offset_lr = pre_offset_learning_rate - self._name = name - - def __call__(self, step): - with tf.name_scope(self._name or "PowerDecayWithOffset"): - step = tf.cast(step, tf.float32) - lr_after_offset = tf.math.pow( - tf.math.maximum(step - self._offset, 1.0), self._power) * ( - self._initial_learning_rate) - - sign = tf.cast(step > self._offset, tf.float32) - lr_combined = (1.0 - sign) * self._pre_offset_lr + sign * lr_after_offset - # Power may give infinitely large LR. So cap it with pre_offset_lr. 
- return tf.math.minimum(lr_combined, self._pre_offset_lr) - - def get_config(self): - """Get the configuration of the learning rate schedule.""" - return { - "initial_learning_rate": self._initial_learning_rate, - "power": self._power, - "offset": self._offset, - "pre_offset_learning_rate": self._pre_offset_lr, - "name": self._name, - } diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/lr_schedule_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/lr_schedule_test.py deleted file mode 100644 index f475de874cb4bc6847afbb1498b888c8b5f3829b..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/lr_schedule_test.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Tests for lr_schedule.""" -from absl.testing import parameterized -import tensorflow as tf - -from official.modeling.optimization import lr_schedule - - -class PowerAndLinearDecayTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.named_parameters( - dict( - testcase_name='power_only', - init_lr=1.0, - power=-1.0, - linear_decay_fraction=0.0, - total_decay_steps=100, - offset=0, - expected=[[0, 1.0], [1, 1.0], [40, 1. / 40.], [60, 1. / 60], - [100, 1. / 100]]), - dict( - testcase_name='linear_only', - init_lr=1.0, - power=0.0, - linear_decay_fraction=1.0, - total_decay_steps=100, - offset=0, - expected=[[0, 1.0], [1, 0.99], [40, 0.6], [60, 0.4], [100, 0.0]]), - dict( - testcase_name='general', - init_lr=1.0, - power=-1.0, - linear_decay_fraction=0.5, - total_decay_steps=100, - offset=0, - expected=[[0, 1.0], [1, 1.0], [40, 1. / 40.], - [60, 1. / 60. * 0.8], [100, 0.0]]), - dict( - testcase_name='offset', - init_lr=1.0, - power=-1.0, - linear_decay_fraction=0.5, - total_decay_steps=100, - offset=90, - expected=[[0, 1.0], [90, 1.0], [91, 1.0], [130, 1. / 40.], - [150, 1. / 60. 
* 0.8], [190, 0.0], [200, 0.0]]), - ) - def test_power_linear_lr_schedule(self, init_lr, power, linear_decay_fraction, - total_decay_steps, offset, expected): - lr = lr_schedule.PowerAndLinearDecay( - initial_learning_rate=init_lr, - power=power, - linear_decay_fraction=linear_decay_fraction, - total_decay_steps=total_decay_steps, - offset=offset) - for step, value in expected: - self.assertAlmostEqual(lr(step).numpy(), value) - - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/optimizer_factory.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/optimizer_factory.py deleted file mode 100644 index 9cdd6747d713a6a56dc3598665d2e05dde6a2833..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/optimizer_factory.py +++ /dev/null @@ -1,191 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""Optimizer factory class.""" -from typing import Callable, Union - -import gin -import tensorflow as tf -import tensorflow_addons.optimizers as tfa_optimizers - -from official.modeling.optimization import ema_optimizer -from official.modeling.optimization import lars_optimizer -from official.modeling.optimization import lr_schedule -from official.modeling.optimization.configs import optimization_config as opt_cfg -from official.nlp import optimization as nlp_optimization - -OPTIMIZERS_CLS = { - 'sgd': tf.keras.optimizers.SGD, - 'adam': tf.keras.optimizers.Adam, - 'adamw': nlp_optimization.AdamWeightDecay, - 'lamb': tfa_optimizers.LAMB, - 'rmsprop': tf.keras.optimizers.RMSprop, - 'lars': lars_optimizer.LARS, - 'adagrad': tf.keras.optimizers.Adagrad, -} - -LR_CLS = { - 'stepwise': tf.keras.optimizers.schedules.PiecewiseConstantDecay, - 'polynomial': tf.keras.optimizers.schedules.PolynomialDecay, - 'exponential': tf.keras.optimizers.schedules.ExponentialDecay, - 'cosine': tf.keras.experimental.CosineDecay, - 'power': lr_schedule.DirectPowerDecay, - 'power_linear': lr_schedule.PowerAndLinearDecay, - 'power_with_offset': lr_schedule.PowerDecayWithOffset, -} - -WARMUP_CLS = { - 'linear': lr_schedule.LinearWarmup, - 'polynomial': lr_schedule.PolynomialWarmUp -} - - -class OptimizerFactory: - """Optimizer factory class. - - This class builds learning rate and optimizer based on an optimization config. - To use this class, you need to do the following: - (1) Define optimization config, this includes optimizer, and learning rate - schedule. - (2) Initialize the class using the optimization config. - (3) Build learning rate. - (4) Build optimizer. - - This is a typical example for using this class: - params = { - 'optimizer': { - 'type': 'sgd', - 'sgd': {'momentum': 0.9} - }, - 'learning_rate': { - 'type': 'stepwise', - 'stepwise': {'boundaries': [10000, 20000], - 'values': [0.1, 0.01, 0.001]} - }, - 'warmup': { - 'type': 'linear', - 'linear': {'warmup_steps': 500, 'warmup_learning_rate': 0.01} - } - } - opt_config = OptimizationConfig(params) - opt_factory = OptimizerFactory(opt_config) - lr = opt_factory.build_learning_rate() - optimizer = opt_factory.build_optimizer(lr) - """ - - def __init__(self, config: opt_cfg.OptimizationConfig): - """Initializing OptimizerFactory. - - Args: - config: OptimizationConfig instance contain optimization config. - """ - self._config = config - self._optimizer_config = config.optimizer.get() - self._optimizer_type = config.optimizer.type - - self._use_ema = config.ema is not None - self._ema_config = config.ema - - if self._optimizer_config is None: - raise ValueError('Optimizer type must be specified') - - self._lr_config = config.learning_rate.get() - self._lr_type = config.learning_rate.type - - if self._lr_type is None: - raise ValueError('Learning rate type must be specified') - - self._warmup_config = config.warmup.get() - self._warmup_type = config.warmup.type - - def build_learning_rate(self): - """Build learning rate. - - Builds learning rate from config. Learning rate schedule is built according - to the learning rate config. If learning rate type is consant, - lr_config.learning_rate is returned. - - Returns: - tf.keras.optimizers.schedules.LearningRateSchedule instance. If - learning rate type is consant, lr_config.learning_rate is returned. 
- """ - if self._lr_type == 'constant': - lr = self._lr_config.learning_rate - else: - lr = LR_CLS[self._lr_type](**self._lr_config.as_dict()) - - if self._warmup_config: - lr = WARMUP_CLS[self._warmup_type](lr, **self._warmup_config.as_dict()) - - return lr - - @gin.configurable - def build_optimizer( - self, - lr: Union[tf.keras.optimizers.schedules.LearningRateSchedule, float], - postprocessor: Callable[[tf.keras.optimizers.Optimizer], - tf.keras.optimizers.Optimizer] = None): - """Build optimizer. - - Builds optimizer from config. It takes learning rate as input, and builds - the optimizer according to the optimizer config. Typically, the learning - rate built using self.build_lr() is passed as an argument to this method. - - Args: - lr: A floating point value, or a - tf.keras.optimizers.schedules.LearningRateSchedule instance. - postprocessor: An optional function for postprocessing the optimizer. It - takes an optimizer and returns an optimizer. - - Returns: - tf.keras.optimizers.Optimizer instance. - """ - - optimizer_dict = self._optimizer_config.as_dict() - ## Delete clipnorm and clipvalue if None - if optimizer_dict['clipnorm'] is None: - del optimizer_dict['clipnorm'] - if optimizer_dict['clipvalue'] is None: - del optimizer_dict['clipvalue'] - - optimizer_dict['learning_rate'] = lr - - optimizer = OPTIMIZERS_CLS[self._optimizer_type](**optimizer_dict) - - if self._use_ema: - optimizer = ema_optimizer.ExponentialMovingAverage( - optimizer, **self._ema_config.as_dict()) - if postprocessor: - optimizer = postprocessor(optimizer) - assert isinstance(optimizer, tf.keras.optimizers.Optimizer), ( - 'OptimizerFactory.build_optimizer returning a non-optimizer object: ' - '{}'.format(optimizer)) - - return optimizer diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/optimizer_factory_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/optimizer_factory_test.py deleted file mode 100644 index 52471de8f9fb2905d34ef8e8b9db5306fe9002c0..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/optimization/optimizer_factory_test.py +++ /dev/null @@ -1,414 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Tests for optimizer_factory.py.""" -from absl.testing import parameterized -import numpy as np -import tensorflow as tf - -from official.modeling.optimization import optimizer_factory -from official.modeling.optimization.configs import optimization_config - - -class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.parameters(('sgd'), ('rmsprop'), ('adam'), ('adamw'), ('lamb'), - ('lars'), ('adagrad')) - def test_optimizers(self, optimizer_type): - params = { - 'optimizer': { - 'type': optimizer_type - }, - 'learning_rate': { - 'type': 'constant', - 'constant': { - 'learning_rate': 0.1 - } - } - } - optimizer_cls = optimizer_factory.OPTIMIZERS_CLS[optimizer_type] - expected_optimizer_config = optimizer_cls().get_config() - expected_optimizer_config['learning_rate'] = 0.1 - - opt_config = optimization_config.OptimizationConfig(params) - opt_factory = optimizer_factory.OptimizerFactory(opt_config) - lr = opt_factory.build_learning_rate() - optimizer = opt_factory.build_optimizer(lr, postprocessor=lambda x: x) - - self.assertIsInstance(optimizer, optimizer_cls) - self.assertEqual(expected_optimizer_config, optimizer.get_config()) - - @parameterized.parameters((None, None), (1.0, None), (None, 1.0)) - def test_gradient_clipping(self, clipnorm, clipvalue): - params = { - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'clipnorm': clipnorm, - 'clipvalue': clipvalue - } - }, - 'learning_rate': { - 'type': 'constant', - 'constant': { - 'learning_rate': 1.0 - } - } - } - - opt_config = optimization_config.OptimizationConfig(params) - opt_factory = optimizer_factory.OptimizerFactory(opt_config) - lr = opt_factory.build_learning_rate() - optimizer = opt_factory.build_optimizer(lr) - - var0 = tf.Variable([1.0, 2.0]) - var1 = tf.Variable([3.0, 4.0]) - - grads0 = tf.constant([0.1, 0.1]) - grads1 = tf.constant([2.0, 3.0]) - - grads_and_vars = list(zip([grads0, grads1], [var0, var1])) - optimizer.apply_gradients(grads_and_vars) - - self.assertAllClose(np.array([0.9, 1.9]), var0.numpy()) - if clipvalue is not None: - self.assertAllClose(np.array([2.0, 3.0]), var1.numpy()) - elif clipnorm is not None: - self.assertAllClose(np.array([2.4452999, 3.1679497]), var1.numpy()) - else: - self.assertAllClose(np.array([1.0, 1.0]), var1.numpy()) - - def test_missing_types(self): - params = {'optimizer': {'type': 'sgd', 'sgd': {'momentum': 0.9}}} - with self.assertRaises(ValueError): - optimizer_factory.OptimizerFactory( - optimization_config.OptimizationConfig(params)) - params = { - 'learning_rate': { - 'type': 'stepwise', - 'stepwise': { - 'boundaries': [10000, 20000], - 'values': [0.1, 0.01, 0.001] - } - } - } - with self.assertRaises(ValueError): - optimizer_factory.OptimizerFactory( - optimization_config.OptimizationConfig(params)) - - -# TODO(b/187559334) refactor lr_schedule tests into `lr_schedule_test.py`. 
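The schedule tests that follow all take the same round trip through the factory; a condensed sketch of it (module paths as imported above, values from the class docstring's own example):

    from official.modeling.optimization import optimizer_factory
    from official.modeling.optimization.configs import optimization_config

    params = {
        'optimizer': {'type': 'sgd', 'sgd': {'momentum': 0.9}},
        'learning_rate': {'type': 'constant',
                          'constant': {'learning_rate': 0.1}},
    }
    opt_config = optimization_config.OptimizationConfig(params)
    opt_factory = optimizer_factory.OptimizerFactory(opt_config)
    # build_learning_rate() feeds build_optimizer(), as in the docstring above.
    optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate())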
- - def test_stepwise_lr_schedule(self): - params = { - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'stepwise', - 'stepwise': { - 'boundaries': [10000, 20000], - 'values': [0.1, 0.01, 0.001] - } - } - } - expected_lr_step_values = [[0, 0.1], [5000, 0.1], [10000, 0.1], - [10001, 0.01], [20000, 0.01], [20001, 0.001]] - opt_config = optimization_config.OptimizationConfig(params) - opt_factory = optimizer_factory.OptimizerFactory(opt_config) - lr = opt_factory.build_learning_rate() - - for step, value in expected_lr_step_values: - self.assertAlmostEqual(lr(step).numpy(), value) - - def test_stepwise_lr_with_warmup_schedule(self): - params = { - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'stepwise', - 'stepwise': { - 'boundaries': [10000, 20000], - 'values': [0.1, 0.01, 0.001] - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': 500, - 'warmup_learning_rate': 0.01 - } - } - } - expected_lr_step_values = [[0, 0.01], [250, 0.055], [500, 0.1], [5500, 0.1], - [10000, 0.1], [10001, 0.01], [20000, 0.01], - [20001, 0.001]] - opt_config = optimization_config.OptimizationConfig(params) - opt_factory = optimizer_factory.OptimizerFactory(opt_config) - lr = opt_factory.build_learning_rate() - - for step, value in expected_lr_step_values: - self.assertAlmostEqual(lr(step).numpy(), value) - - def test_exponential_lr_schedule(self): - params = { - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'exponential', - 'exponential': { - 'initial_learning_rate': 0.1, - 'decay_steps': 1000, - 'decay_rate': 0.96, - 'staircase': True - } - } - } - expected_lr_step_values = [ - [0, 0.1], - [999, 0.1], - [1000, 0.096], - [1999, 0.096], - [2000, 0.09216], - ] - opt_config = optimization_config.OptimizationConfig(params) - opt_factory = optimizer_factory.OptimizerFactory(opt_config) - lr = opt_factory.build_learning_rate() - - for step, value in expected_lr_step_values: - self.assertAlmostEqual(lr(step).numpy(), value) - - def test_polynomial_lr_schedule(self): - params = { - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'polynomial', - 'polynomial': { - 'initial_learning_rate': 0.1, - 'decay_steps': 1000, - 'end_learning_rate': 0.001 - } - } - } - - expected_lr_step_values = [[0, 0.1], [500, 0.0505], [1000, 0.001]] - opt_config = optimization_config.OptimizationConfig(params) - opt_factory = optimizer_factory.OptimizerFactory(opt_config) - lr = opt_factory.build_learning_rate() - - for step, value in expected_lr_step_values: - self.assertAlmostEqual(lr(step).numpy(), value) - - def test_cosine_lr_schedule(self): - params = { - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'cosine', - 'cosine': { - 'initial_learning_rate': 0.1, - 'decay_steps': 1000 - } - } - } - expected_lr_step_values = [[0, 0.1], [250, 0.08535534], [500, 0.04999999], - [750, 0.01464466], [1000, 0]] - opt_config = optimization_config.OptimizationConfig(params) - opt_factory = optimizer_factory.OptimizerFactory(opt_config) - lr = opt_factory.build_learning_rate() - - for step, value in expected_lr_step_values: - self.assertAlmostEqual(lr(step).numpy(), value) - - def test_constant_lr_with_warmup_schedule(self): - params = { - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'constant', - 'constant': { - 
'learning_rate': 0.1 - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': 500, - 'warmup_learning_rate': 0.01 - } - } - } - - expected_lr_step_values = [[0, 0.01], [250, 0.055], [500, 0.1], [5000, 0.1], - [10000, 0.1], [20000, 0.1]] - opt_config = optimization_config.OptimizationConfig(params) - opt_factory = optimizer_factory.OptimizerFactory(opt_config) - lr = opt_factory.build_learning_rate() - - for step, value in expected_lr_step_values: - self.assertAlmostEqual(lr(step).numpy(), value) - - def test_stepwise_lr_with_polynomial_warmup_schedule(self): - params = { - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'stepwise', - 'stepwise': { - 'boundaries': [10000, 20000], - 'values': [0.1, 0.01, 0.001] - } - }, - 'warmup': { - 'type': 'polynomial', - 'polynomial': { - 'warmup_steps': 500, - 'power': 2. - } - } - } - expected_lr_step_values = [[0, 0.0], [250, 0.025], [500, 0.1], [5500, 0.1], - [10000, 0.1], [10001, 0.01], [20000, 0.01], - [20001, 0.001]] - opt_config = optimization_config.OptimizationConfig(params) - opt_factory = optimizer_factory.OptimizerFactory(opt_config) - lr = opt_factory.build_learning_rate() - - for step, value in expected_lr_step_values: - self.assertAlmostEqual(lr(step).numpy(), value, places=6) - - def test_power_lr_schedule(self): - params = { - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'power', - 'power': { - 'initial_learning_rate': 1.0, - 'power': -1.0 - } - } - } - expected_lr_step_values = [[0, 1.0], [1, 1.0], [250, 1. / 250.]] - opt_config = optimization_config.OptimizationConfig(params) - opt_factory = optimizer_factory.OptimizerFactory(opt_config) - lr = opt_factory.build_learning_rate() - - for step, value in expected_lr_step_values: - self.assertAlmostEqual(lr(step).numpy(), value) - - def test_power_linear_lr_schedule(self): - params = { - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'power_linear', - 'power_linear': { - 'initial_learning_rate': 1.0, - 'power': -1.0, - 'linear_decay_fraction': 0.5, - 'total_decay_steps': 100, - 'offset': 0, - } - } - } - expected_lr_step_values = [[0, 1.0], [1, 1.0], [40, 1. / 40.], - [60, 1. / 60. * 0.8]] - opt_config = optimization_config.OptimizationConfig(params) - opt_factory = optimizer_factory.OptimizerFactory(opt_config) - lr = opt_factory.build_learning_rate() - - for step, value in expected_lr_step_values: - self.assertAlmostEqual(lr(step).numpy(), value) - - def test_power_with_offset_lr_schedule(self): - params = { - 'optimizer': { - 'type': 'sgd', - 'sgd': { - 'momentum': 0.9 - } - }, - 'learning_rate': { - 'type': 'power_with_offset', - 'power_with_offset': { - 'initial_learning_rate': 1.0, - 'power': -1.0, - 'offset': 10, - 'pre_offset_learning_rate': 3.0, - } - } - } - expected_lr_step_values = [[1, 3.0], [10, 3.0], [20, 1. 
/ 10.]] - opt_config = optimization_config.OptimizationConfig(params) - opt_factory = optimizer_factory.OptimizerFactory(opt_config) - lr = opt_factory.build_learning_rate() - - for step, value in expected_lr_step_values: - self.assertAlmostEqual(lr(step).numpy(), value) - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/performance.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/performance.py deleted file mode 100644 index f3fe1cc1603bd0a93cff74458c4962fe21448725..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/performance.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Functions and classes related to training performance.""" - -from absl import logging -import tensorflow as tf -import npu_device as npu -''' -def configure_optimizer(optimizer, - use_float16=False, - use_graph_rewrite=False, - loss_scale="dynamic"): - """Configures optimizer object with performance options.""" - - if isinstance(loss_scale,(int,float)): - print("Use static npu loss scale with init loss scale {}".format(loss_scale),flush=True) - optimizer=(npu.train.optimizer.NpuLossScaleOptimizer(optimizer,dynamic=False,initial_scale=loss_scale)) - elif loss_scale=="dynamic": - print("Use dynamic npu loss scale ",flush=True) - optimizer=(npu.train.optimizer.NpuLossScaleOptimizer(optimizer)) - else: - raise RuntimeError("Unsupported npu loss scale value {}".format(loss_scale)) - - if use_graph_rewrite: - # Note: the model dtype must be 'float32', which will ensure - # tf.keras.mixed_precision and - # tf.train.experimental.enable_mixed_precision_graph_rewrite do not double - # up. - optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite( - optimizer) - return optimizer - -''' -def configure_optimizer(optimizer, - use_float16=False, - use_graph_rewrite=False, - loss_scale='dynamic', - use_experimental_api=False): - """Configures optimizer object with performance options.""" - if use_experimental_api: - logging.warning('Passing use_experimental_api=True is deprecated.
The ' - 'argument will be removed in the future.') - if use_float16: - # TODO(b/171936854): Move all methods to non-experimental api. - if use_experimental_api: - # Wraps optimizer with a LossScaleOptimizer. This is done automatically - # in compile() with the "mixed_float16" policy, but since we do not call - # compile(), we must wrap the optimizer manually. - optimizer = ( - tf.keras.mixed_precision.experimental.LossScaleOptimizer( - optimizer, loss_scale=loss_scale)) - elif loss_scale == 'dynamic': - optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer) - else: - # loss_scale is a number. We interpret that as a fixed loss scale. - optimizer = tf.keras.mixed_precision.LossScaleOptimizer( - optimizer, dynamic=False, initial_scale=loss_scale) - if use_graph_rewrite: - # Note: the model dtype must be 'float32', which will ensure - # tf.keras.mixed_precision and enable_mixed_precision_graph_rewrite do not - # double up. - optimizer = ( - tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( - optimizer)) - return optimizer - - -def set_mixed_precision_policy(dtype, loss_scale=None, - use_experimental_api=False): - """Sets mixed precision policy.""" - if use_experimental_api: - logging.warning('Passing use_experimental_api=True is deprecated. The ' - 'argument will be removed in the future.') - assert use_experimental_api or loss_scale is None, ( - 'loss_scale cannot be specified if use_experimental_api is False. If the ' - 'non-experimental API is used, specify the loss scaling configuration ' - 'when creating the LossScaleOptimizer instead.' - ) - if dtype == tf.float16: - # TODO(b/171936854): Move all methods to non-experimental api. - if use_experimental_api: - policy = tf.keras.mixed_precision.experimental.Policy( - 'mixed_float16', loss_scale=loss_scale) - tf.keras.mixed_precision.experimental.set_policy(policy) - else: - tf.keras.mixed_precision.set_global_policy('mixed_float16') - elif dtype == tf.bfloat16: - if use_experimental_api: - tf.keras.mixed_precision.experimental.set_policy('mixed_bfloat16') - else: - tf.keras.mixed_precision.set_global_policy('mixed_bfloat16') - elif dtype == tf.float32: - if use_experimental_api: - tf.keras.mixed_precision.experimental.set_policy('float32') - else: - tf.keras.mixed_precision.set_global_policy('float32') - else: - raise ValueError('Unexpected dtype: %s' % dtype) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/policies.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/policies.py deleted file mode 100644 index 879a3a61f9b3bc54d192c508a3f880a7aff58f21..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/policies.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
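A minimal sketch of how the two helpers above compose for a float16 GPU setup (the commented-out NPU variant takes the same shape); this is illustrative usage, not a prescribed configuration:

    import tensorflow as tf
    from official.modeling import performance

    # Set the global policy first; the model must then be built under it.
    performance.set_mixed_precision_policy(tf.float16)
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
    # Wrap the optimizer with dynamic loss scaling for float16 training.
    optimizer = performance.configure_optimizer(
        optimizer, use_float16=True, loss_scale='dynamic')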
-# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Base ProgressivePolicy definition for progressive training. - -To write a progressive model, subclass ProgressivePolicy and implement its -abstract methods to handle each training stage. -""" - -import abc -from typing import Any, Mapping -from absl import logging -import dataclasses -import six -import tensorflow as tf -from official.modeling.hyperparams import base_config -from official.modeling.progressive import utils - - -@dataclasses.dataclass -class ProgressiveConfig(base_config.Config): - pass - - -@six.add_metaclass(abc.ABCMeta) -class ProgressivePolicy: - """The APIs for handling progressive training stages. - - Attributes: - cur_model: The model for the current progressive training stage. - cur_train_dataset: The train dataset function for the current stage. - cur_eval_dataset: The eval dataset function for the current stage. - cur_optimizer: The optimizer for the current stage. - cur_checkpoint_items: Items to be saved in and restored from checkpoints, - for the progressive trainer. - is_last_stage: Whether it is currently in the last stage. - - Interfaces: - is_stage_advancing: Returns if progressive training is advancing to the - next stage. - update_pt_stage: Update progressive training stage. - """ - - def __init__(self): - """Initialize stage policy.""" - self._cur_train_dataset = None - self._cur_eval_dataset = None - self._volatiles = utils.VolatileTrackable(optimizer=None, model=None) - - stage_id = 0 - self._stage_id = tf.Variable( - stage_id, - trainable=False, - dtype=tf.int64, - aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA, - shape=[]) - self._volatiles.reassign_trackable( - optimizer=self.get_optimizer(stage_id), - model=self.get_model(stage_id, old_model=None)) - - def compute_stage_id(self, global_step: int) -> int: - for stage_id in range(self.num_stages()): - global_step -= self.num_steps(stage_id) - if global_step < 0: - return stage_id - logging.error('Global step %d found no matching progressive stages. ' - 'Default to the last stage.', global_step) - return self.num_stages() - 1 - - @abc.abstractmethod - def num_stages(self) -> int: - """Return the total number of progressive stages.""" - pass - - @abc.abstractmethod - def num_steps(self, stage_id: int) -> int: - """Return the total number of steps in this stage.""" - pass - - @abc.abstractmethod - def get_model(self, - stage_id: int, - old_model: tf.keras.Model = None) -> tf.keras.Model: - """Return model for this stage. 
For initialization, `old_model` = None.""" - pass - - @abc.abstractmethod - def get_optimizer(self, stage_id: int) -> tf.keras.optimizers.Optimizer: - """Return optimizer for this stage.""" - pass - - @abc.abstractmethod - def get_train_dataset(self, stage_id: int) -> tf.data.Dataset: - """Return training Dataset for this stage.""" - pass - - @abc.abstractmethod - def get_eval_dataset(self, stage_id: int) -> tf.data.Dataset: - """Return evaluation Dataset for this stage.""" - pass - - @property - def cur_model(self) -> tf.keras.Model: - return self._volatiles.model - - @property - def cur_train_dataset(self) -> tf.data.Dataset: - if self._cur_train_dataset is None: - self._cur_train_dataset = self.get_train_dataset(self._stage_id.numpy()) - return self._cur_train_dataset - - @property - def cur_eval_dataset(self) -> tf.data.Dataset: - if self._cur_eval_dataset is None: - self._cur_eval_dataset = self.get_eval_dataset(self._stage_id.numpy()) - return self._cur_eval_dataset - - @property - def cur_optimizer(self) -> tf.keras.optimizers.Optimizer: - return self._volatiles.optimizer - - @property - def is_last_stage(self) -> bool: - stage_id = self._stage_id.numpy() - return stage_id >= self.num_stages() - 1 - - @property - def cur_checkpoint_items(self) -> Mapping[str, Any]: - return dict(stage_id=self._stage_id, volatiles=self._volatiles) - - def is_stage_advancing(self, global_step: int) -> bool: - old_stage_id = self._stage_id.numpy() - new_stage_id = self.compute_stage_id(global_step) - return old_stage_id != new_stage_id - - def update_pt_stage(self, global_step: int, pass_old_model=True) -> None: - """Update progressive training internal status. - - Call this after a training loop ends. - - Args: - global_step: an integer scalar of the current global step. - pass_old_model: whether to pass the old_model to the get_model() - function. This is set to False if the old_model is irrelevant (e.g., - just a default model from stage 0). - """ - old_stage_id = self._stage_id.numpy() - new_stage_id = self.compute_stage_id(global_step) - logging.info('Switching stage from %d to %d', old_stage_id, new_stage_id) - - # Update stage id. - self._stage_id.assign(new_stage_id) - # Update dataset function. - self._cur_train_dataset = None - self._cur_eval_dataset = None - - # Update optimizer and model. - new_optimizer = self.get_optimizer(new_stage_id) - self._volatiles.reassign_trackable(optimizer=new_optimizer) - new_model = self.get_model( - new_stage_id, old_model=self.cur_model if pass_old_model else None) - self._volatiles.reassign_trackable(model=new_model) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/train.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/train.py deleted file mode 100644 index 0419792db81c2ebd4a0ad127bb56113ab3897a4d..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/train.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
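compute_stage_id above walks the per-stage step budgets until the global step is exhausted. A pure-Python mirror of that bookkeeping, using the hypothetical stage lengths (2 and 4 steps) that the ProgMockTask test task further below returns:

    def compute_stage_id(global_step, steps_per_stage):
        # Subtract each stage's budget; the stage where the step runs out wins.
        for stage_id, num_steps in enumerate(steps_per_stage):
            global_step -= num_steps
            if global_step < 0:
                return stage_id
        return len(steps_per_stage) - 1  # past the end: default to last stage

    assert [compute_stage_id(s, [2, 4]) for s in range(7)] == [0, 0, 1, 1, 1, 1, 1]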
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""TFM binary for the progressive trainer.""" - -from absl import app -from absl import flags -import gin - -from official.common import distribute_utils -# pylint: disable=unused-import -from official.common import registry_imports -# pylint: enable=unused-import -from official.common import flags as tfm_flags -from official.core import task_factory -from official.core import train_utils -from official.modeling import performance -from official.modeling.progressive import train_lib - -FLAGS = flags.FLAGS - - -def main(_): - gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params) - params = train_utils.parse_configuration(FLAGS) - model_dir = FLAGS.model_dir - if 'train' in FLAGS.mode: - # Pure eval modes do not output yaml files. Otherwise continuous eval job - # may race against the train job for writing the same file. - train_utils.serialize_config(params, model_dir) - - # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16' - # can have significant impact on model speeds by utilizing float16 in case of - # GPUs, and bfloat16 in the case of TPUs. 
loss_scale takes effect only when - # dtype is float16 - if params.runtime.mixed_precision_dtype: - performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype) - distribution_strategy = distribute_utils.get_distribution_strategy( - distribution_strategy=params.runtime.distribution_strategy, - all_reduce_alg=params.runtime.all_reduce_alg, - num_gpus=params.runtime.num_gpus, - tpu_address=params.runtime.tpu, - **params.runtime.model_parallelism()) - with distribution_strategy.scope(): - task = task_factory.get_task(params.task, logging_dir=model_dir) - - train_lib.run_experiment( - distribution_strategy=distribution_strategy, - task=task, - mode=FLAGS.mode, - params=params, - model_dir=model_dir) - - train_utils.save_gin_config(FLAGS.mode, model_dir) - -if __name__ == '__main__': - tfm_flags.define_flags() - app.run(main) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/train_lib.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/train_lib.py deleted file mode 100644 index 7334aa6be22b9aece7b6c1bd5e93657e5e39219d..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/train_lib.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""TFM progressive training driver library. - -Compared to the common training driver, the only difference is that we use -prog_trainer_lib.ProgressiveTrainer instead of the base trainer. -""" - -# pytype: disable=attribute-error -import os -from typing import Any, Mapping, Tuple - -# Import libraries -from absl import logging -import orbit -import tensorflow as tf -from official.core import base_task -from official.core import config_definitions -from official.core import train_lib as base_train_lib -from official.modeling.progressive import trainer as prog_trainer_lib - - -def run_experiment(distribution_strategy: tf.distribute.Strategy, - task: base_task.Task, - mode: str, - params: config_definitions.ExperimentConfig, - model_dir: str, - run_post_eval: bool = False, - save_summary: bool = True) \ --> Tuple[tf.keras.Model, Mapping[str, Any]]: - """Runs train/eval configured by the experiment params. 
-
-  Args:
-    distribution_strategy: A distribution strategy.
-    task: A Task instance.
-    mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval'
-      or 'continuous_eval'.
-    params: ExperimentConfig instance.
-    model_dir: A 'str', a path to store model checkpoints and summaries.
-    run_post_eval: Whether to run evaluation once after training; if True, the
-      metrics logs are returned.
-    save_summary: Whether to save train and validation summaries.
-
-  Returns:
-    A 2-tuple of (model, eval_logs).
-      model: `tf.keras.Model` instance.
-      eval_logs: the eval metrics logs when run_post_eval is set to True;
-        otherwise, an empty dict {}.
-  """
-
-  with distribution_strategy.scope():
-    logging.info('Running progressive trainer.')
-    trainer = prog_trainer_lib.ProgressiveTrainer(
-        params, task, ckpt_dir=model_dir,
-        train='train' in mode,
-        evaluate=('eval' in mode) or run_post_eval,
-        checkpoint_exporter=base_train_lib.maybe_create_best_ckpt_exporter(
-            params, model_dir))
-
-  if trainer.checkpoint:
-    checkpoint_manager = tf.train.CheckpointManager(
-        trainer.checkpoint,
-        directory=model_dir,
-        max_to_keep=params.trainer.max_to_keep,
-        step_counter=trainer.global_step,
-        checkpoint_interval=params.trainer.checkpoint_interval,
-        init_fn=trainer.initialize)
-  else:
-    checkpoint_manager = None
-
-  controller = orbit.Controller(
-      strategy=distribution_strategy,
-      trainer=trainer if 'train' in mode else None,
-      evaluator=trainer,
-      global_step=trainer.global_step,
-      steps_per_loop=params.trainer.steps_per_loop,
-      checkpoint_manager=checkpoint_manager,
-      summary_dir=os.path.join(model_dir, 'train') if save_summary else None,
-      eval_summary_dir=os.path.join(model_dir, 'validation') if
-      save_summary else None,
-      summary_interval=params.trainer.summary_interval if
-      save_summary else None)
-
-  logging.info('Starts to execute mode: %s', mode)
-  with distribution_strategy.scope():
-    if mode == 'train':
-      controller.train(steps=params.trainer.train_steps)
-    elif mode == 'train_and_eval':
-      controller.train_and_evaluate(
-          train_steps=params.trainer.train_steps,
-          eval_steps=params.trainer.validation_steps,
-          eval_interval=params.trainer.validation_interval)
-    elif mode == 'eval':
-      controller.evaluate(steps=params.trainer.validation_steps)
-    elif mode == 'continuous_eval':
-
-      def timeout_fn():
-        # Stop continuous evaluation once training has reached train_steps.
-        return trainer.global_step.numpy() >= params.trainer.train_steps
-
-      controller.evaluate_continuously(
-          steps=params.trainer.validation_steps,
-          timeout=params.trainer.continuous_eval_timeout,
-          timeout_fn=timeout_fn)
-    else:
-      raise NotImplementedError('The mode is not implemented: %s' % mode)
-
-  if run_post_eval:
-    with distribution_strategy.scope():
-      return trainer.model, trainer.evaluate(
-          tf.convert_to_tensor(params.trainer.validation_steps))
-  else:
-    return trainer.model, {}
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/train_lib_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/train_lib_test.py
deleted file mode 100644
index f69a862d028ce31e6536583d26d1ef0bd4effdcc..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/train_lib_test.py
+++ /dev/null
@@ -1,199 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Tests for the progressive train_lib.""" -import os - -from absl import flags -from absl.testing import parameterized -import dataclasses -import orbit -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations -from official.common import flags as tfm_flags -# pylint: disable=unused-import -from official.common import registry_imports -# pylint: enable=unused-import -from official.core import config_definitions as cfg -from official.core import task_factory -from official.modeling import optimization -from official.modeling.hyperparams import params_dict -from official.modeling.progressive import policies -from official.modeling.progressive import train_lib -from official.modeling.progressive import trainer as prog_trainer_lib -from official.utils.testing import mock_task - -FLAGS = flags.FLAGS - -tfm_flags.define_flags() - - -@dataclasses.dataclass -class ProgTaskConfig(cfg.TaskConfig): - pass - - -@task_factory.register_task_cls(ProgTaskConfig) -class ProgMockTask(policies.ProgressivePolicy, mock_task.MockTask): - """Progressive task for testing.""" - - def __init__(self, params: cfg.TaskConfig, logging_dir: str = None): - mock_task.MockTask.__init__( - self, params=params, logging_dir=logging_dir) - policies.ProgressivePolicy.__init__(self) - - def num_stages(self): - return 2 - - def num_steps(self, stage_id): - return 2 if stage_id == 0 else 4 - - def get_model(self, stage_id, old_model=None): - del stage_id, old_model - return self.build_model() - - def get_optimizer(self, stage_id): - """Build optimizer for each stage.""" - params = optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'adamw', - }, - 'learning_rate': { - 'type': 'polynomial', - 'polynomial': { - 'initial_learning_rate': 0.01, - 'end_learning_rate': 0.0, - 'power': 1.0, - 'decay_steps': 10, - }, - }, - 'warmup': { - 'polynomial': { - 'power': 1, - 'warmup_steps': 2, - }, - 'type': 'polynomial', - } - }) - opt_factory = optimization.OptimizerFactory(params) - optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate()) - - return optimizer - - def get_train_dataset(self, stage_id): - del stage_id - strategy = tf.distribute.get_strategy() - return orbit.utils.make_distributed_dataset( - strategy, self.build_inputs, None) - - def get_eval_dataset(self, stage_id): - del stage_id - strategy = tf.distribute.get_strategy() - 
return orbit.utils.make_distributed_dataset( - strategy, self.build_inputs, None) - - -class TrainTest(tf.test.TestCase, parameterized.TestCase): - - def setUp(self): - super(TrainTest, self).setUp() - self._test_config = { - 'trainer': { - 'checkpoint_interval': 10, - 'steps_per_loop': 10, - 'summary_interval': 10, - 'train_steps': 10, - 'validation_steps': 5, - 'validation_interval': 10, - 'continuous_eval_timeout': 1, - 'optimizer_config': { - 'optimizer': { - 'type': 'sgd', - }, - 'learning_rate': { - 'type': 'constant' - } - } - }, - } - - @combinations.generate( - combinations.combine( - distribution_strategy=[ - strategy_combinations.default_strategy, - strategy_combinations.cloud_tpu_strategy, - strategy_combinations.one_device_strategy_gpu, - ], - flag_mode=['train', 'eval', 'train_and_eval'], - run_post_eval=[True, False])) - def test_end_to_end(self, distribution_strategy, flag_mode, run_post_eval): - model_dir = self.get_temp_dir() - experiment_config = cfg.ExperimentConfig( - trainer=prog_trainer_lib.ProgressiveTrainerConfig(), - task=ProgTaskConfig()) - experiment_config = params_dict.override_params_dict( - experiment_config, self._test_config, is_strict=False) - - with distribution_strategy.scope(): - task = task_factory.get_task(experiment_config.task, - logging_dir=model_dir) - - _, logs = train_lib.run_experiment( - distribution_strategy=distribution_strategy, - task=task, - mode=flag_mode, - params=experiment_config, - model_dir=model_dir, - run_post_eval=run_post_eval) - - if run_post_eval: - self.assertNotEmpty(logs) - else: - self.assertEmpty(logs) - - if flag_mode == 'eval': - return - self.assertNotEmpty( - tf.io.gfile.glob(os.path.join(model_dir, 'checkpoint'))) - # Tests continuous evaluation. - _, logs = train_lib.run_experiment( - distribution_strategy=distribution_strategy, - task=task, - mode='continuous_eval', - params=experiment_config, - model_dir=model_dir, - run_post_eval=run_post_eval) - print(logs) - - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/trainer.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/trainer.py deleted file mode 100644 index 13bece1fdc2dc5070107b5d9a9e7c59074ed5c79..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/trainer.py +++ /dev/null @@ -1,307 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""Progressive Trainer implementation.
-
-The trainer implements the Orbit `StandardTrainable` and
-`StandardEvaluable` interfaces. Trainers inside this project should be
-interchangeable and independent of model architectures and tasks.
-"""
-import os
-from typing import Any, Optional
-
-# Import libraries
-from absl import logging
-
-import dataclasses
-import gin
-import orbit
-import tensorflow as tf
-from official.core import base_task
-from official.core import base_trainer as trainer_lib
-from official.core import config_definitions
-from official.modeling.progressive import policies
-from official.modeling.progressive import utils
-
-ExperimentConfig = config_definitions.ExperimentConfig
-
-
-@dataclasses.dataclass
-class ProgressiveTrainerConfig(config_definitions.TrainerConfig):
-  """Configuration for progressive trainer.
-
-  Attributes:
-    progressive: A task-specific config. Users can subclass ProgressiveConfig
-      and define any task-specific settings in their subclass.
-    export_checkpoint: A bool. Whether to export checkpoints in a
-      non-progressive manner (without the volatiles wrapper) such that your
-      down-stream tasks can load checkpoints from a progressive trainer as if
-      they were regular checkpoints.
-    export_checkpoint_interval: An int. The number of steps between exporting
-      checkpoints. If None (by default), will use the same value as
-      TrainerConfig.checkpoint_interval.
-    export_max_to_keep: An int. The maximum number of exported checkpoints to
-      keep. If None (by default), will use the same value as
-      TrainerConfig.max_to_keep.
-    export_only_final_stage_ckpt: A bool. Whether to export checkpoints only
-      during the final progressive training stage, i.e., whether to skip
-      exporting small, partial models. In many cases, it is not meaningful to
-      fine-tune a small, partial model in down-stream tasks.
-  """
-  progressive: Optional[policies.ProgressiveConfig] = None
-  export_checkpoint: bool = True
-  export_checkpoint_interval: Optional[int] = None
-  export_max_to_keep: Optional[int] = None
-  export_only_final_stage_ckpt: bool = True
-
-
-@gin.configurable
-class ProgressiveTrainer(trainer_lib.Trainer):
-  """Implements the progressive trainer shared for TensorFlow models."""
-
-  def __init__(
-      self,
-      config: ExperimentConfig,
-      prog_task: base_task.Task,  # also implements ProgressivePolicy.
-      ckpt_dir: str = '',
-      train: bool = True,
-      evaluate: bool = True,
-      checkpoint_exporter: Any = None):
-    """Initializes the common trainer for TensorFlow models.
-
-    Args:
-      config: An `ExperimentConfig` instance specifying experiment config.
-      prog_task: An instance that implements both policies.ProgressivePolicy
-        and base_task.Task.
-      ckpt_dir: Checkpoint directory.
-      train: bool, whether or not this trainer will be used for training.
-        Defaults to True.
-      evaluate: bool, whether or not this trainer will be used for evaluation.
-        Defaults to True.
-      checkpoint_exporter: an object that has the `maybe_export_checkpoint`
-        interface.
-    """
-    # Gets the current distribution strategy. If not inside any strategy scope,
-    # it gets a single-replica no-op strategy.
-    self._strategy = tf.distribute.get_strategy()
-    self._config = config
-    self._runtime_options = trainer_lib.get_runtime_options(config)
-    self._task = prog_task
-
-    # Directory for non-progressive checkpoints.
-    self._export_ckpt_dir = os.path.join(ckpt_dir, 'exported_ckpts')
-    tf.io.gfile.makedirs(self._export_ckpt_dir)
-    self._export_ckpt_manager = None
-
-    # Receives other checkpoint exporters, e.g., the best checkpoint exporter.
-    # TODO(lehou): unify the checkpoint exporting logic, although the default
-    # setting does not use checkpoint_exporter.
-    self._checkpoint_exporter = checkpoint_exporter
-
-    self._global_step = orbit.utils.create_global_step()
-
-    self._checkpoint = utils.CheckpointWithHooks(
-        before_load_hook=self._update_pt_stage_from_ckpt,
-        global_step=self.global_step,
-        **self._task.cur_checkpoint_items)
-
-    self._train_loss = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
-    self._validation_loss = tf.keras.metrics.Mean(
-        'validation_loss', dtype=tf.float32)
-    self._train_metrics = self.task.build_metrics(
-        training=True) + self.model.metrics
-    self._validation_metrics = self.task.build_metrics(
-        training=False) + self.model.metrics
-
-    if train:
-      orbit.StandardTrainer.__init__(
-          self,
-          None,  # Manage train_dataset by ourselves, not by StandardTrainer.
-          options=orbit.StandardTrainerOptions(
-              use_tf_while_loop=config.trainer.train_tf_while_loop,
-              use_tf_function=config.trainer.train_tf_function))
-
-    if evaluate:
-      orbit.StandardEvaluator.__init__(
-          self,
-          None,  # Manage eval_dataset by ourselves, not by StandardEvaluator.
-          options=orbit.StandardEvaluatorOptions(
-              use_tf_function=config.trainer.eval_tf_function))
-
-  @property
-  def model(self):
-    return self._task.cur_model
-
-  @property
-  def optimizer(self):
-    return self._task.cur_optimizer
-
-  # override
-  @property
-  def train_dataset(self):
-    """Overriding StandardTrainer.train_dataset."""
-    return self._task.cur_train_dataset
-
-  # override
-  @train_dataset.setter
-  def train_dataset(self, _):
-    raise SyntaxError('Please do not set train_dataset. Progressive training '
-                      'relies on the progressive policy to manage the train '
-                      'dataset.')
-
-  # override
-  @property
-  def eval_dataset(self):
-    """Overriding StandardEvaluator.eval_dataset."""
-    return self._task.cur_eval_dataset
-
-  # override
-  @eval_dataset.setter
-  def eval_dataset(self, _):
-    raise SyntaxError('Please do not set eval_dataset. Progressive training '
-                      'relies on the progressive policy to manage the eval '
-                      'dataset.')
-
-  def train_loop_end(self):
-    """See base class."""
-    logs = {}
-    for metric in self.train_metrics + [self.train_loss]:
-      logs[metric.name] = metric.result()
-      metric.reset_states()
-    if callable(self.optimizer.learning_rate):
-      logs['learning_rate'] = self.optimizer.learning_rate(
-          self.optimizer.iterations)
-    else:
-      logs['learning_rate'] = self.optimizer.learning_rate
-
-    self._maybe_export_non_progressive_checkpoint(self._export_ckpt_dir)
-    if self._task.is_stage_advancing(self.global_step.numpy()):
-      old_train_dataset = self.train_dataset
-
-      # Update progressive properties.
-      self._task.update_pt_stage(self.global_step.numpy())
-
-      # Setting `self._train_loop_fn` and `self._eval_loop_fn` to None will
-      # rebuild the train and eval functions with the updated model.
-      self._train_loop_fn = None
-      self._eval_loop_fn = None
-
-      if self.train_dataset != old_train_dataset:
-        # Setting `self._train_iter` to None will rebuild the dataset iterator.
-        self._train_iter = None
-
-      # Setting `self._export_ckpt_manager` to None will rebuild the checkpoint
-      # for exporting.
-      self._export_ckpt_manager = None
-
-    return logs
-
-  def _update_pt_stage_from_ckpt(self, ckpt_file):
-    """Updates stage properties based on the global_step in a ckpt file.
-
-    Before loading variables from a checkpoint file, we need to go to the
-    correct stage and build the corresponding model and optimizer, to make
-    sure that we restore the variables of the right model and optimizer.
-
-    Args:
-      ckpt_file: Checkpoint file that will be restored/read from.
-    """
-    if not ckpt_file:
-      return
-    ckpt = tf.train.Checkpoint(global_step=self.global_step)
-    ckpt.read(ckpt_file).expect_partial().assert_existing_objects_matched()
-
-    if self._task.is_stage_advancing(self.global_step.numpy()):
-      old_train_dataset = self.train_dataset
-
-      # Update progressive properties.
-      self._task.update_pt_stage(self.global_step.numpy(), pass_old_model=False)
-
-      # Setting `self._train_loop_fn` and `self._eval_loop_fn` to None will
-      # rebuild the train and eval functions with the updated model.
-      self._train_loop_fn = None
-      self._eval_loop_fn = None
-
-      if self.train_dataset != old_train_dataset:
-        # Setting `self._train_iter` to None will rebuild the dataset iterator.
-        self._train_iter = None
-
-      # Setting `self._export_ckpt_manager` to None will rebuild the checkpoint
-      # for exporting.
-      self._export_ckpt_manager = None
-
-  def _maybe_export_non_progressive_checkpoint(self, export_ckpt_dir):
-    """Exports checkpoints in non-progressive format.
-
-    This basically removes the wrapping of self._task.cur_checkpoint_items
-    -- it just saves the model, optimizer, etc., directly. The purpose is to
-    let downstream tasks use these checkpoints.
-
-    Args:
-      export_ckpt_dir: A str, the folder of exported checkpoints.
-    """
-    if not self.config.trainer.export_checkpoint:
-      logging.info('Not exporting checkpoints.')
-      return
-    if not self._task.is_last_stage and (
-        self.config.trainer.export_only_final_stage_ckpt):
-      logging.info('Not exporting checkpoints until the last stage.')
-      return
-
-    if self._export_ckpt_manager is None:
-      # Create the checkpoint object only now, to make sure we use
-      # progressive_policy.cur_model and progressive_policy.cur_optimizer of
-      # the current stage.
- if hasattr(self.model, 'checkpoint_items'): - checkpoint_items = self.model.checkpoint_items - else: - checkpoint_items = {} - checkpoint = tf.train.Checkpoint( - global_step=self.global_step, - model=self.model, - optimizer=self.optimizer, - **checkpoint_items) - - max_to_keep = self.config.trainer.export_max_to_keep or ( - self.config.trainer.max_to_keep) - checkpoint_interval = self.config.trainer.export_checkpoint_interval or ( - self.config.trainer.checkpoint_interval) - self._export_ckpt_manager = tf.train.CheckpointManager( - checkpoint, - directory=export_ckpt_dir, - checkpoint_name='ckpt', - step_counter=self.global_step, - max_to_keep=max_to_keep, - checkpoint_interval=checkpoint_interval, - ) - - checkpoint_path = self._export_ckpt_manager.save( - checkpoint_number=self.global_step.numpy(), - check_interval=True) - if checkpoint_path: - logging.info('Checkpoints exported: %s.', checkpoint_path) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/trainer_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/trainer_test.py deleted file mode 100644 index 7969caa0d1fe371163d40c94d9cd7334a23cb49c..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/trainer_test.py +++ /dev/null @@ -1,254 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""Tests for the progressive trainer.""" -# pylint: disable=g-direct-tensorflow-import -import os - -from absl.testing import parameterized -import orbit -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations -from official.core import config_definitions as cfg -from official.modeling import optimization -from official.modeling.progressive import policies -from official.modeling.progressive import trainer as trainer_lib -from official.nlp.configs import bert -from official.utils.testing import mock_task - - -def all_strategy_combinations(): - return combinations.combine( - distribution=[ - strategy_combinations.default_strategy, - strategy_combinations.cloud_tpu_strategy, - strategy_combinations.one_device_strategy_gpu, - ],) - - -def get_exp_config(): - return cfg.ExperimentConfig( - task=cfg.TaskConfig( - model=bert.PretrainerConfig()), - trainer=trainer_lib.ProgressiveTrainerConfig( - export_checkpoint=True, - export_checkpoint_interval=1, - export_only_final_stage_ckpt=False)) - - -class TestPolicy(policies.ProgressivePolicy, mock_task.MockTask): - """Just for testing purposes.""" - - def __init__(self, strategy, task_config, change_train_dataset=True): - self._strategy = strategy - self._change_train_dataset = change_train_dataset - self._my_train_dataset = None - mock_task.MockTask.__init__(self, params=task_config, logging_dir=None) - policies.ProgressivePolicy.__init__(self) - - def num_stages(self) -> int: - return 2 - - def num_steps(self, stage_id: int) -> int: - return 2 if stage_id == 0 else 4 - - def get_model(self, - stage_id: int, - old_model: tf.keras.Model) -> tf.keras.Model: - del stage_id, old_model - return self.build_model() - - def get_optimizer(self, stage_id: int) -> tf.keras.optimizers.Optimizer: - optimizer_type = 'sgd' if stage_id == 0 else 'adamw' - optimizer_config = cfg.OptimizationConfig({ - 'optimizer': {'type': optimizer_type}, - 'learning_rate': {'type': 'constant'}}) - opt_factory = optimization.OptimizerFactory(optimizer_config) - return opt_factory.build_optimizer(opt_factory.build_learning_rate()) - - def get_train_dataset(self, stage_id: int) -> tf.data.Dataset: - if not self._change_train_dataset and self._my_train_dataset: - return self._my_train_dataset - if self._strategy: - self._my_train_dataset = orbit.utils.make_distributed_dataset( - self._strategy, - self._build_inputs, - stage_id) - else: - self._my_train_dataset = self._build_inputs(stage_id) - return self._my_train_dataset - - def get_eval_dataset(self, stage_id: int) -> tf.data.Dataset: - if self._strategy: - return orbit.utils.make_distributed_dataset( - self._strategy, - self._build_inputs, - stage_id) - return self._build_inputs(stage_id) - - def _build_inputs(self, stage_id): - def dummy_data(_): - batch_size = 2 if stage_id == 0 else 1 - x = tf.zeros(shape=(batch_size, 2), dtype=tf.float32) - label = tf.zeros(shape=(batch_size, 1), dtype=tf.float32) - return x, label - dataset = tf.data.Dataset.range(1) - dataset = dataset.repeat() - return dataset.map( - dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) - - -class TrainerTest(tf.test.TestCase, parameterized.TestCase): - - def setUp(self): - super(TrainerTest, self).setUp() - self._config = get_exp_config() - - def create_test_trainer(self, distribution, model_dir, change_train_dataset): - trainer = trainer_lib.ProgressiveTrainer( - self._config, - prog_task=TestPolicy( - distribution, self._config.task, 
change_train_dataset), - ckpt_dir=model_dir) - return trainer - - @combinations.generate(all_strategy_combinations()) - def test_checkpointing(self, distribution): - model_dir = self.get_temp_dir() - ckpt_file = os.path.join(model_dir, 'ckpt') - with distribution.scope(): - trainer = self.create_test_trainer(distribution, model_dir, True) - self.assertFalse(trainer._task.is_last_stage) - trainer.train(tf.convert_to_tensor(4, dtype=tf.int32)) - self.assertTrue(trainer._task.is_last_stage) - trainer.checkpoint.save(ckpt_file) - - trainer = self.create_test_trainer(distribution, model_dir, True) - self.assertFalse(trainer._task.is_last_stage) - trainer.checkpoint.restore(ckpt_file + '-1') - self.assertTrue(trainer._task.is_last_stage) - - @combinations.generate(all_strategy_combinations()) - def test_train_dataset(self, distribution): - model_dir = self.get_temp_dir() - with distribution.scope(): - trainer = self.create_test_trainer(distribution, model_dir, True) - # Using dataset of stage == 0 - train_iter = tf.nest.map_structure(iter, trainer.train_dataset) - train_data = train_iter.next()[0] - if distribution.num_replicas_in_sync > 1: - train_data = train_data.values[0] - self.assertEqual(train_data.shape[0], 2) - - trainer.train(tf.convert_to_tensor(4, dtype=tf.int32)) - # Using dataset of stage == 1 - train_iter = tf.nest.map_structure(iter, trainer.train_dataset) - train_data = train_iter.next()[0] - if distribution.num_replicas_in_sync > 1: - train_data = train_data.values[0] - self.assertEqual(train_data.shape[0], 1) - - with self.assertRaises(SyntaxError): - trainer.train_dataset = None - - @combinations.generate(all_strategy_combinations()) - def test_train_dataset_no_switch(self, distribution): - model_dir = self.get_temp_dir() - with distribution.scope(): - trainer = self.create_test_trainer(distribution, model_dir, False) - trainer.train(tf.convert_to_tensor(2, dtype=tf.int32)) - # _train_iter is not reset since the dataset is not changed. - self.assertIsNotNone(trainer._train_iter) - with distribution.scope(): - trainer = self.create_test_trainer(distribution, model_dir, True) - trainer.train(tf.convert_to_tensor(2, dtype=tf.int32)) - # _train_iter is reset since the dataset changed. - self.assertIsNone(trainer._train_iter) - - -class TrainerWithMaskedLMTaskTest(tf.test.TestCase, parameterized.TestCase): - - def setUp(self): - super(TrainerWithMaskedLMTaskTest, self).setUp() - self._config = get_exp_config() - - def create_test_trainer(self, distribution): - trainer = trainer_lib.ProgressiveTrainer( - self._config, - prog_task=TestPolicy(distribution, self._config.task), - ckpt_dir=self.get_temp_dir()) - return trainer - - @combinations.generate(all_strategy_combinations()) - def test_trainer_train(self, distribution): - with distribution.scope(): - trainer = self.create_test_trainer(distribution) - logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertIn('training_loss', logs) - self.assertIn('learning_rate', logs) - - @combinations.generate(all_strategy_combinations()) - def test_trainer_validate(self, distribution): - with distribution.scope(): - trainer = self.create_test_trainer(distribution) - logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertIn('validation_loss', logs) - self.assertEqual(logs['counter'], 5. 
* distribution.num_replicas_in_sync) - - @combinations.generate( - combinations.combine( - mixed_precision_dtype=['float32', 'bfloat16', 'float16'], - loss_scale=[None, 'dynamic', 128, 256], - )) - def test_configure_optimizer(self, mixed_precision_dtype, loss_scale): - config = cfg.ExperimentConfig( - task=cfg.TaskConfig( - model=bert.PretrainerConfig()), - runtime=cfg.RuntimeConfig( - mixed_precision_dtype=mixed_precision_dtype, loss_scale=loss_scale), - trainer=trainer_lib.ProgressiveTrainerConfig( - export_checkpoint=True, - export_checkpoint_interval=1, - export_only_final_stage_ckpt=False)) - task = TestPolicy(None, config.task) - trainer = trainer_lib.ProgressiveTrainer(config, task, self.get_temp_dir()) - if mixed_precision_dtype != 'float16': - self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD) - elif mixed_precision_dtype == 'float16' and loss_scale is None: - self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD) - - metrics = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertIn('training_loss', metrics) - - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/utils.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/utils.py deleted file mode 100644 index aa2c5523509011ee0fb7fa74cb870a97f22e88fb..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/progressive/utils.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""Util classes and functions.""" - -from absl import logging -import tensorflow as tf - -# pylint: disable=g-direct-tensorflow-import -from tensorflow.python.training.tracking import tracking - - -class VolatileTrackable(tracking.AutoTrackable): - """A util class to keep Trackables that might change instances.""" - - def __init__(self, **kwargs): - for k, v in kwargs.items(): - setattr(self, k, v) - - def reassign_trackable(self, **kwargs): - for k, v in kwargs.items(): - delattr(self, k) # untrack this object - setattr(self, k, v) # track the new object - - -class CheckpointWithHooks(tf.train.Checkpoint): - """Same as tf.train.Checkpoint but supports hooks. - - In progressive training, use this class instead of tf.train.Checkpoint. - - Since the network architecture changes during progressive training, we need to - prepare something (like switch to the correct architecture) before loading the - checkpoint. This class supports a hook that will be executed before checkpoint - loading. - """ - - def __init__(self, before_load_hook, **kwargs): - self._before_load_hook = before_load_hook - super(CheckpointWithHooks, self).__init__(**kwargs) - - # override - def read(self, save_path, options=None): - self._before_load_hook(save_path) - logging.info('Ran before_load_hook.') - super(CheckpointWithHooks, self).read(save_path=save_path, options=options) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/tf_utils.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/tf_utils.py deleted file mode 100644 index c8ec65d1fda5b7f53581a6770fcb9687eff47252..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/modeling/tf_utils.py +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Common TF utilities.""" - -import six -import tensorflow as tf - -from tensorflow.python.util import deprecation -from official.modeling import activations - - -@deprecation.deprecated( - None, - "tf.keras.layers.Layer supports multiple positional args and kwargs as " - "input tensors. pack/unpack inputs to override __call__ is no longer " - "needed.") -def pack_inputs(inputs): - """Pack a list of `inputs` tensors to a tuple. 
- - Args: - inputs: a list of tensors. - - Returns: - a tuple of tensors. if any input is None, replace it with a special constant - tensor. - """ - inputs = tf.nest.flatten(inputs) - outputs = [] - for x in inputs: - if x is None: - outputs.append(tf.constant(0, shape=[], dtype=tf.int32)) - else: - outputs.append(x) - return tuple(outputs) - - -@deprecation.deprecated( - None, - "tf.keras.layers.Layer supports multiple positional args and kwargs as " - "input tensors. pack/unpack inputs to override __call__ is no longer " - "needed.") -def unpack_inputs(inputs): - """unpack a tuple of `inputs` tensors to a tuple. - - Args: - inputs: a list of tensors. - - Returns: - a tuple of tensors. if any input is a special constant tensor, replace it - with None. - """ - inputs = tf.nest.flatten(inputs) - outputs = [] - for x in inputs: - if is_special_none_tensor(x): - outputs.append(None) - else: - outputs.append(x) - x = tuple(outputs) - - # To trick the very pointless 'unbalanced-tuple-unpacking' pylint check - # from triggering. - if len(x) == 1: - return x[0] - return tuple(outputs) - - -def is_special_none_tensor(tensor): - """Checks if a tensor is a special None Tensor.""" - return tensor.shape.ndims == 0 and tensor.dtype == tf.int32 - - -def get_activation(identifier, use_keras_layer=False): - """Maps a identifier to a Python function, e.g., "relu" => `tf.nn.relu`. - - It checks string first and if it is one of customized activation not in TF, - the corresponding activation will be returned. For non-customized activation - names and callable identifiers, always fallback to tf.keras.activations.get. - - Prefers using keras layers when use_keras_layer=True. Now it only supports - 'relu', 'linear', 'identity', 'swish'. - - Args: - identifier: String name of the activation function or callable. - use_keras_layer: If True, use keras layer if identifier is allow-listed. - - Returns: - A Python function corresponding to the activation function or a keras - activation layer when use_keras_layer=True. - """ - if isinstance(identifier, six.string_types): - identifier = str(identifier).lower() - if use_keras_layer: - keras_layer_allowlist = { - "relu": "relu", - "linear": "linear", - "identity": "linear", - "swish": "swish", - "relu6": tf.nn.relu6, - } - if identifier in keras_layer_allowlist: - return tf.keras.layers.Activation(keras_layer_allowlist[identifier]) - name_to_fn = { - "gelu": activations.gelu, - "simple_swish": activations.simple_swish, - "hard_swish": activations.hard_swish, - "relu6": activations.relu6, - "hard_sigmoid": activations.hard_sigmoid, - "identity": activations.identity, - } - if identifier in name_to_fn: - return tf.keras.activations.get(name_to_fn[identifier]) - return tf.keras.activations.get(identifier) - - -def get_shape_list(tensor, expected_rank=None, name=None): - """Returns a list of the shape of tensor, preferring static dimensions. - - Args: - tensor: A tf.Tensor object to find the shape of. - expected_rank: (optional) int. The expected rank of `tensor`. If this is - specified and the `tensor` has a different rank, and exception will be - thrown. - name: Optional name of the tensor for the error message. - - Returns: - A list of dimensions of the shape of tensor. All static dimensions will - be returned as python integers, and dynamic dimensions will be returned - as tf.Tensor scalars. 
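-
-
-# Illustrative usage of the helpers in this module (a sketch; not part of the
-# module's public examples):
-#
-#   get_activation("gelu")                        # -> the customized gelu fn
-#   get_activation("relu", use_keras_layer=True)  # -> a Keras Activation layer
-#   get_shape_list(tf.ones([2, 3]), expected_rank=2)  # -> [2, 3] (see below)
-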
- """ - if expected_rank is not None: - assert_rank(tensor, expected_rank, name) - - shape = tensor.shape.as_list() - - non_static_indexes = [] - for (index, dim) in enumerate(shape): - if dim is None: - non_static_indexes.append(index) - - if not non_static_indexes: - return shape - - dyn_shape = tf.shape(tensor) - for index in non_static_indexes: - shape[index] = dyn_shape[index] - return shape - - -def assert_rank(tensor, expected_rank, name=None): - """Raises an exception if the tensor rank is not of the expected rank. - - Args: - tensor: A tf.Tensor to check the rank of. - expected_rank: Python integer or list of integers, expected rank. - name: Optional name of the tensor for the error message. - - Raises: - ValueError: If the expected shape doesn't match the actual shape. - """ - expected_rank_dict = {} - if isinstance(expected_rank, six.integer_types): - expected_rank_dict[expected_rank] = True - else: - for x in expected_rank: - expected_rank_dict[x] = True - - actual_rank = tensor.shape.ndims - if actual_rank not in expected_rank_dict: - raise ValueError( - "For the tensor `%s`, the actual tensor rank `%d` (shape = %s) is not " - "equal to the expected tensor rank `%s`" % - (name, actual_rank, str(tensor.shape), str(expected_rank))) - - -def safe_mean(losses): - """Computes a safe mean of the losses. - - Args: - losses: `Tensor` whose elements contain individual loss measurements. - - Returns: - A scalar representing the mean of `losses`. If `num_present` is zero, - then zero is returned. - """ - total = tf.reduce_sum(losses) - num_elements = tf.cast(tf.size(losses), dtype=losses.dtype) - return tf.math.divide_no_nan(total, num_elements) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/README.md b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/README.md deleted file mode 100644 index dfa047b4ed3f0bea46e4b4db48578bd543f3e984..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/README.md +++ /dev/null @@ -1,62 +0,0 @@ -# TensorFlow NLP Modelling Toolkit - -This codebase provides a Natrual Language Processing modeling toolkit written in -[TF2](https://www.tensorflow.org/guide/effective_tf2). It allows researchers and -developers to reproduce state-of-the-art model results and train custom models -to experiment new research ideas. - -## Features - -* Reusable and modularized modeling building blocks -* State-of-the-art reproducible -* Easy to customize and extend -* End-to-end training -* Distributed trainable on both GPUs and TPUs - -## Major components - -### Libraries - -We provide modeling library to allow users to train custom models for new -research ideas. Detailed intructions can be found in READMEs in each folder. - -* [modeling/](modeling): modeling library that provides building blocks - (e.g.,Layers, Networks, and Models) that can be assembled into - transformer-based achitectures . -* [data/](data): binaries and utils for input preprocessing, tokenization, - etc. - -### State-of-the-Art models and examples - -We provide SoTA model implementations, pre-trained models, training and -evaluation examples, and command lines. Detail instructions can be found in the -READMEs for specific papers. - -1. [BERT](bert): [BERT: Pre-training of Deep Bidirectional Transformers for - Language Understanding](https://arxiv.org/abs/1810.04805) by Devlin et al., - 2018 -2. 
-
-### State-of-the-Art models and examples
-
-We provide SoTA model implementations, pre-trained models, training and
-evaluation examples, and command lines. Detailed instructions can be found in
-the READMEs for specific papers.
-
-1. [BERT](bert): [BERT: Pre-training of Deep Bidirectional Transformers for
-   Language Understanding](https://arxiv.org/abs/1810.04805) by Devlin et al.,
-   2018
-2. [ALBERT](albert):
-   [A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942)
-   by Lan et al., 2019
-3. [XLNet](xlnet):
-   [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237)
-   by Yang et al., 2019
-4. [Transformer for translation](transformer):
-   [Attention Is All You Need](https://arxiv.org/abs/1706.03762) by Vaswani et
-   al., 2017
-5. [NHNet](nhnet):
-   [Generating Representative Headlines for News Stories](https://arxiv.org/abs/2001.09386)
-   by Gu et al., 2020
-
-### Common Training Driver
-
-We provide a single common driver [train.py](train.py) to train the above SoTA
-models on popular tasks; a sketch of a typical invocation follows. Please see
-[docs/train.md](docs/train.md) for more details.
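-
-The experiment name, config file, and paths in this sketch are illustrative
-assumptions; [docs/train.md](docs/train.md) has the authoritative flag list:
-
-```shell
-python3 train.py \
-  --experiment=bert/sentence_prediction \
-  --mode=train_and_eval \
-  --model_dir=/tmp/model_dir \
-  --config_file=path/to/experiment_config.yaml
-```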
-
-### Pre-trained models with checkpoints and TF-Hub
-
-We provide a large collection of baselines and checkpoints for NLP pre-trained
-models. Please see [docs/pretrained_models.md](docs/pretrained_models.md) for
-more details.
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/__init__.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/__init__.py
deleted file mode 100644
index a11b1ff79e891e0fcee5bf824718e75d9103e28f..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/__init__.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/README.md b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/README.md
deleted file mode 100644
index 037ff0b1ff8c6ea22bcf692bb8f786320b7d2d48..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/README.md
+++ /dev/null
@@ -1,395 +0,0 @@
-# BERT (Bidirectional Encoder Representations from Transformers)
-
-**WARNING**: We are in the process of deprecating most of the code in this
-directory. Please see
-[this link](https://github.com/tensorflow/models/blob/master/official/nlp/docs/train.md)
-for the new tutorial and use the new code in `nlp/modeling`. This README is
-still correct for this legacy implementation.
-
-The academic paper which describes BERT in detail and provides full results on
-a number of tasks can be found here: https://arxiv.org/abs/1810.04805.
-
-This repository contains a TensorFlow 2.x implementation of BERT.
-
-## Contents
-  * [Contents](#contents)
-  * [Pre-trained Models](#pre-trained-models)
-    * [Restoring from Checkpoints](#restoring-from-checkpoints)
-  * [Set Up](#set-up)
-  * [Process Datasets](#process-datasets)
-  * [Fine-tuning with BERT](#fine-tuning-with-bert)
-    * [Cloud GPUs and TPUs](#cloud-gpus-and-tpus)
-    * [Sentence and Sentence-pair Classification Tasks](#sentence-and-sentence-pair-classification-tasks)
-    * [SQuAD 1.1](#squad-1.1)
-
-
-## Pre-trained Models
-
-We released both checkpoints and tf.hub modules as the pretrained models for
-fine-tuning. They are TF 2.x compatible and are converted from the checkpoints
-released in the TF 1.x official BERT repository
-[google-research/bert](https://github.com/google-research/bert)
-in order to stay consistent with the BERT paper.
-
-
-### Access to Pretrained Checkpoints
-
-Pretrained checkpoints can be found in the following links:
-
-**Note: We have switched the BERT implementation
-to use Keras functional-style networks in [nlp/modeling](../modeling).
-The new checkpoints are:**
-
-* **[`BERT-Large, Uncased (Whole Word Masking)`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/wwm_uncased_L-24_H-1024_A-16.tar.gz)**:
-  24-layer, 1024-hidden, 16-heads, 340M parameters
-* **[`BERT-Large, Cased (Whole Word Masking)`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/wwm_cased_L-24_H-1024_A-16.tar.gz)**:
-  24-layer, 1024-hidden, 16-heads, 340M parameters
-* **[`BERT-Base, Uncased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12.tar.gz)**:
-  12-layer, 768-hidden, 12-heads, 110M parameters
-* **[`BERT-Large, Uncased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16.tar.gz)**:
-  24-layer, 1024-hidden, 16-heads, 340M parameters
-* **[`BERT-Base, Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-12_H-768_A-12.tar.gz)**:
-  12-layer, 768-hidden, 12-heads, 110M parameters
-* **[`BERT-Large, Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-24_H-1024_A-16.tar.gz)**:
-  24-layer, 1024-hidden, 16-heads, 340M parameters
-* **[`BERT-Base, Multilingual Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/multi_cased_L-12_H-768_A-12.tar.gz)**:
-  104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
-
-We recommend hosting checkpoints in Google Cloud Storage buckets when you use
-Cloud GPU/TPU.
-
-### Restoring from Checkpoints
-
-`tf.train.Checkpoint` is used to manage model checkpoints in TF 2. To restore
-weights from provided pre-trained checkpoints, you can use the following code:
-
-```python
-init_checkpoint = 'the pretrained model checkpoint path.'
-model = tf.keras.Model()  # BERT pre-trained model as a feature extractor.
-checkpoint = tf.train.Checkpoint(model=model)
-checkpoint.restore(init_checkpoint)
-```
-
-Checkpoints featuring native serialized Keras models
-(i.e., model.load()/load_weights()) will be available soon.
-
-### Access to Pretrained hub modules
-
-Pretrained tf.hub modules in TF 2.x SavedModel format can be found in the
-following links:
-
-* **[`BERT-Large, Uncased (Whole Word Masking)`](https://tfhub.dev/tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/)**:
-  24-layer, 1024-hidden, 16-heads, 340M parameters
-* **[`BERT-Large, Cased (Whole Word Masking)`](https://tfhub.dev/tensorflow/bert_en_wwm_cased_L-24_H-1024_A-16/)**:
-  24-layer, 1024-hidden, 16-heads, 340M parameters
-* **[`BERT-Base, Uncased`](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/)**:
-  12-layer, 768-hidden, 12-heads, 110M parameters
-* **[`BERT-Large, Uncased`](https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/)**:
-  24-layer, 1024-hidden, 16-heads, 340M parameters
-* **[`BERT-Base, Cased`](https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/)**:
-  12-layer, 768-hidden, 12-heads, 110M parameters
-* **[`BERT-Large, Cased`](https://tfhub.dev/tensorflow/bert_en_cased_L-24_H-1024_A-16/)**:
-  24-layer, 1024-hidden, 16-heads, 340M parameters
-* **[`BERT-Base, Multilingual Cased`](https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/)**:
-  104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
-* **[`BERT-Base, Chinese`](https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/)**:
-  Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads,
-  110M parameters
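-
-For illustration, a hub module can be used as a Keras layer roughly as follows
-(a sketch assuming `tensorflow_hub` is installed; the exact input/output
-signature depends on the module version you pick -- recent versions take a
-dict of int32 features):
-
-```python
-import tensorflow as tf
-import tensorflow_hub as hub
-
-encoder = hub.KerasLayer(
-    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
-    trainable=True)
-inputs = dict(
-    input_word_ids=tf.keras.layers.Input(shape=(128,), dtype=tf.int32),
-    input_mask=tf.keras.layers.Input(shape=(128,), dtype=tf.int32),
-    input_type_ids=tf.keras.layers.Input(shape=(128,), dtype=tf.int32))
-outputs = encoder(inputs)
-pooled = outputs["pooled_output"]  # Shape: [batch_size, 768].
-```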
-
-## Set Up
-
-```shell
-export PYTHONPATH="$PYTHONPATH:/path/to/models"
-```
-
-Install `tf-nightly` to get the latest updates:
-
-```shell
-pip install tf-nightly-gpu
-```
-
-With a TPU, GPU support is not necessary. First, you need to create a
-`tf-nightly` TPU with the
-[ctpu tool](https://github.com/tensorflow/tpu/tree/master/tools/ctpu):
-
-```shell
-ctpu up --name=<tpu-name> --tf-version="nightly"
-```
-
-Second, you need to install TF 2 `tf-nightly` on your VM:
-
-```shell
-pip install tf-nightly
-```
-
-## Process Datasets
-
-### Pre-training
-
-There is no change in how pre-training data is generated. Please use the script
-[`../data/create_pretraining_data.py`](../data/create_pretraining_data.py),
-which is essentially branched from the
-[BERT research repo](https://github.com/google-research/bert), to get processed
-pre-training data; it has been adapted to TF2 symbols and Python 3
-compatibility.
-
-Running the pre-training script requires an input and output directory, as well
-as a vocab file. Note that max_seq_length will need to match the sequence
-length parameter you specify when you run pre-training.
-
-Example shell script to call create_pretraining_data.py
-```
-export WORKING_DIR='local disk or cloud location'
-export BERT_DIR='local disk or cloud location'
-python models/official/nlp/data/create_pretraining_data.py \
-  --input_file=$WORKING_DIR/input/input.txt \
-  --output_file=$WORKING_DIR/output/tf_examples.tfrecord \
-  --vocab_file=$BERT_DIR/wwm_uncased_L-24_H-1024_A-16/vocab.txt \
-  --do_lower_case=True \
-  --max_seq_length=512 \
-  --max_predictions_per_seq=76 \
-  --masked_lm_prob=0.15 \
-  --random_seed=12345 \
-  --dupe_factor=5
-```
-
-### Fine-tuning
-
-To prepare the fine-tuning data for final model training, use the
-[`../data/create_finetuning_data.py`](../data/create_finetuning_data.py)
-script. The resulting datasets in `tf_record` format and the training metadata
-should later be passed to the training or evaluation scripts. The task-specific
-arguments are described in the following sections:
-
-* GLUE
-
-Users can download the
-[GLUE data](https://gluebenchmark.com/tasks) by running
-[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
-and unpacking it to some directory `$GLUE_DIR`.
-Alternatively, users can download a
-[Pretrained Checkpoint](#access-to-pretrained-checkpoints) and place it in some
-directory `$BERT_DIR` instead of using the checkpoints on Google Cloud Storage.
-
-```shell
-export GLUE_DIR=~/glue
-export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
-
-export TASK_NAME=MNLI
-export OUTPUT_DIR=gs://some_bucket/datasets
-python ../data/create_finetuning_data.py \
- --input_data_dir=${GLUE_DIR}/${TASK_NAME}/ \
- --vocab_file=${BERT_DIR}/vocab.txt \
- --train_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_train.tf_record \
- --eval_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_eval.tf_record \
- --meta_data_file_path=${OUTPUT_DIR}/${TASK_NAME}_meta_data \
- --fine_tuning_task_type=classification --max_seq_length=128 \
- --classification_task_name=${TASK_NAME}
-```
-
-* SQUAD
-
-The [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/) contains
-detailed information about the SQuAD datasets and evaluation.
-
-The necessary files can be found here:
-
-* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
-* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
-* [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
-* [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
-* [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
-* [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
-
-```shell
-export SQUAD_DIR=~/squad
-export SQUAD_VERSION=v1.1
-export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
-export OUTPUT_DIR=gs://some_bucket/datasets
-
-python ../data/create_finetuning_data.py \
- --squad_data_file=${SQUAD_DIR}/train-${SQUAD_VERSION}.json \
- --vocab_file=${BERT_DIR}/vocab.txt \
- --train_data_output_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
- --meta_data_file_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_meta_data \
- --fine_tuning_task_type=squad --max_seq_length=384
-```
-
-Note: To create fine-tuning data with SQuAD 2.0, you need to add the flag
-`--version_2_with_negative=True`.
-
-## Fine-tuning with BERT
-
-### Cloud GPUs and TPUs
-
-* Cloud Storage
-
-The unzipped pre-trained model files can also be found in the Google Cloud
-Storage folder `gs://cloud-tpu-checkpoints/bert/keras_bert`. For example:
-
-```shell
-export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
-export MODEL_DIR=gs://some_bucket/my_output_dir
-```
-
-Currently, users have access to `tf-nightly` TPUs, and the following TPU
-script should run with `tf-nightly`.
-
-* GPU -> TPU
-
-Just add the following flags to `run_classifier.py` or `run_squad.py`:
-
-```shell
-  --distribution_strategy=tpu
-  --tpu=grpc://${TPU_IP_ADDRESS}:8470
-```
-
-### Sentence and Sentence-pair Classification Tasks
-
-This example code fine-tunes `BERT-Large` on the Microsoft Research Paraphrase
-Corpus (MRPC), which contains only 3,600 examples and can be fine-tuned in a
-few minutes on most GPUs.
-
-We use the `BERT-Large` (uncased_L-24_H-1024_A-16) as an example throughout the
-workflow.
-For GPU memory of 16GB or smaller, you may try to use `BERT-Base` -(uncased_L-12_H-768_A-12). - -```shell -export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16 -export MODEL_DIR=gs://some_bucket/my_output_dir -export GLUE_DIR=gs://some_bucket/datasets -export TASK=MRPC - -python run_classifier.py \ - --mode='train_and_eval' \ - --input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \ - --train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \ - --eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \ - --bert_config_file=${BERT_DIR}/bert_config.json \ - --init_checkpoint=${BERT_DIR}/bert_model.ckpt \ - --train_batch_size=4 \ - --eval_batch_size=4 \ - --steps_per_loop=1 \ - --learning_rate=2e-5 \ - --num_train_epochs=3 \ - --model_dir=${MODEL_DIR} \ - --distribution_strategy=mirrored -``` - -Alternatively, instead of specifying `init_checkpoint`, you can specify -`hub_module_url` to employ a pretraind BERT hub module, e.g., -` --hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1`. - -After training a model, to get predictions from the classifier, you can set the -`--mode=predict` and offer the test set tfrecords to `--eval_data_path`. -Output will be created in file called test_results.tsv in the output folder. -Each line will contain output for each sample, columns are the class -probabilities. - -```shell -python run_classifier.py \ - --mode='predict' \ - --input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \ - --eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \ - --bert_config_file=${BERT_DIR}/bert_config.json \ - --eval_batch_size=4 \ - --model_dir=${MODEL_DIR} \ - --distribution_strategy=mirrored -``` - -To use TPU, you only need to switch distribution strategy type to `tpu` with TPU -information and use remote storage for model checkpoints. - -```shell -export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16 -export TPU_IP_ADDRESS='???' -export MODEL_DIR=gs://some_bucket/my_output_dir -export GLUE_DIR=gs://some_bucket/datasets -export TASK=MRPC - -python run_classifier.py \ - --mode='train_and_eval' \ - --input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \ - --train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \ - --eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \ - --bert_config_file=${BERT_DIR}/bert_config.json \ - --init_checkpoint=${BERT_DIR}/bert_model.ckpt \ - --train_batch_size=32 \ - --eval_batch_size=32 \ - --steps_per_loop=1000 \ - --learning_rate=2e-5 \ - --num_train_epochs=3 \ - --model_dir=${MODEL_DIR} \ - --distribution_strategy=tpu \ - --tpu=grpc://${TPU_IP_ADDRESS}:8470 -``` - -Note that, we specify `steps_per_loop=1000` for TPU, because running a loop of -training steps inside a `tf.function` can significantly increase TPU utilization -and callbacks will not be called inside the loop. - -### SQuAD 1.1 - -The Stanford Question Answering Dataset (SQuAD) is a popular question answering -benchmark dataset. See more in [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/). - -We use the `BERT-Large` (uncased_L-24_H-1024_A-16) as an example throughout the -workflow. -For GPU memory of 16GB or smaller, you may try to use `BERT-Base` -(uncased_L-12_H-768_A-12). 
-
-```shell
-export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
-export SQUAD_DIR=gs://some_bucket/datasets
-export MODEL_DIR=gs://some_bucket/my_output_dir
-export SQUAD_VERSION=v1.1
-
-python run_squad.py \
- --input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
- --train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
- --predict_file=${SQUAD_DIR}/dev-v1.1.json \
- --vocab_file=${BERT_DIR}/vocab.txt \
- --bert_config_file=${BERT_DIR}/bert_config.json \
- --init_checkpoint=${BERT_DIR}/bert_model.ckpt \
- --train_batch_size=4 \
- --predict_batch_size=4 \
- --learning_rate=8e-5 \
- --num_train_epochs=2 \
- --model_dir=${MODEL_DIR} \
- --distribution_strategy=mirrored
-```
-
-Similarly, you can replace the `init_checkpoint` flag with `hub_module_url` to
-specify a hub module path.
-
-`run_squad.py` writes the predictions for `--predict_file` by default. If you set
-`--mode=predict` and provide the SQuAD test data, the script will generate
-the prediction JSON file.
-
-To use a TPU, you need to switch the distribution strategy type to `tpu` and
-provide the TPU information.
-
-```shell
-export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
-export TPU_IP_ADDRESS='???'
-export MODEL_DIR=gs://some_bucket/my_output_dir
-export SQUAD_DIR=gs://some_bucket/datasets
-export SQUAD_VERSION=v1.1
-
-python run_squad.py \
- --input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
- --train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
- --predict_file=${SQUAD_DIR}/dev-v1.1.json \
- --vocab_file=${BERT_DIR}/vocab.txt \
- --bert_config_file=${BERT_DIR}/bert_config.json \
- --init_checkpoint=${BERT_DIR}/bert_model.ckpt \
- --train_batch_size=32 \
- --learning_rate=8e-5 \
- --num_train_epochs=2 \
- --model_dir=${MODEL_DIR} \
- --distribution_strategy=tpu \
- --tpu=grpc://${TPU_IP_ADDRESS}:8470
-```
-
-The dev set predictions will be saved to a file called predictions.json in the
-`model_dir`, and can be scored with the official evaluation script:
-
-```shell
-python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ./squad/predictions.json
-```
-
-
 diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/__init__.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/__init__.py deleted file mode 100644 index 3ef7bb85ba5f722a4f34e90623470d5a45af3aa4..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/__init__.py +++ /dev/null @@ -1,31 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-
 diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/bert_cloud_tpu.md b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/bert_cloud_tpu.md deleted file mode 100644 index baf6f9bdc0c155cb53b30cea5f404aa166c3a2c6..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/bert_cloud_tpu.md +++ /dev/null @@ -1,110 +0,0 @@
-# BERT FineTuning with Cloud TPU: Sentence and Sentence-Pair Classification Tasks (TF 2.1)
-This tutorial shows you how to train the Bidirectional Encoder Representations from Transformers (BERT) model on a Cloud TPU.
-
-
-## Set up Cloud Storage and Compute Engine VM
-1. [Open a cloud shell window](https://console.cloud.google.com/?cloudshell=true&_ga=2.11844148.-1612541229.1552429951)
-2. Create a variable for the project's ID:
-```
-export PROJECT_ID=your-project_id
-```
-3. Configure the `gcloud` command-line tool to use the project where you want to create the Cloud TPU.
-```
-gcloud config set project ${PROJECT_ID}
-```
-4. Create a Cloud Storage bucket using the following command:
-```
-gsutil mb -p ${PROJECT_ID} -c standard -l europe-west4 -b on gs://your-bucket-name
-```
-This Cloud Storage bucket stores the data you use to train your model and the training results.
-5. Launch a Compute Engine VM and Cloud TPU using the `ctpu up` command.
-```
-ctpu up --tpu-size=v3-8 \
- --machine-type=n1-standard-8 \
- --zone=europe-west4-a \
- --tf-version=2.1 [optional flags: --project, --name]
-```
-6. The configuration you specified appears. Enter y to approve or n to cancel.
-7. When the `ctpu up` command has finished executing, verify that your shell prompt has changed from `username@project` to `username@tpuname`. This change shows that you are now logged into your Compute Engine VM.
-```
-gcloud compute ssh vm-name --zone=europe-west4-a
-(vm)$ export TPU_NAME=vm-name
-```
-As you continue these instructions, run each command that begins with `(vm)$` in your VM session window.
-
-## Prepare the Dataset
-1. From your Compute Engine virtual machine (VM), install the dependencies listed in `requirements.txt`.
-```
-(vm)$ cd /usr/share/models
-(vm)$ sudo pip3 install -r official/requirements.txt
-```
-2. Optional: download `download_glue_data.py`.
-
-This tutorial uses the General Language Understanding Evaluation (GLUE) benchmark to evaluate and analyze the performance of the model. The GLUE data is provided for this tutorial at `gs://cloud-tpu-checkpoints/bert/classification`.
-
-## Define parameter values
-Next, define several parameter values that are required when you train and evaluate your model:
-
-```
-(vm)$ export PYTHONPATH="$PYTHONPATH:/usr/share/tpu/models"
-(vm)$ export STORAGE_BUCKET=gs://your-bucket-name
-(vm)$ export BERT_BASE_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
-(vm)$ export MODEL_DIR=${STORAGE_BUCKET}/bert-output
-(vm)$ export GLUE_DIR=gs://cloud-tpu-checkpoints/bert/classification
-(vm)$ export TASK=mnli
-```
-
-## Train the model
-From your Compute Engine VM, run the training command below; an optional TPU connectivity check is sketched first.
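-
-Optionally, before launching training, you can verify that the VM can actually
-reach the TPU. The sketch below uses standard TF 2.x TPU initialization calls
-and assumes `TPU_NAME` was exported as above; treat it as an optional sanity
-check rather than part of the tutorial's required steps:
-
-```python
-# Optional sanity check (illustrative): confirm the TPU is reachable
-# before starting a long training run. Assumes TPU_NAME is exported.
-import os
-import tensorflow as tf
-
-resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
-    tpu=os.environ["TPU_NAME"])
-tf.config.experimental_connect_to_cluster(resolver)
-tf.tpu.experimental.initialize_tpu_system(resolver)
-print("TPU devices:", tf.config.list_logical_devices("TPU"))
-```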
-
-```
-(vm)$ python3 official/nlp/bert/run_classifier.py \
- --mode='train_and_eval' \
- --input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
- --train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
- --eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
- --bert_config_file=$BERT_BASE_DIR/bert_config.json \
- --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
- --train_batch_size=32 \
- --eval_batch_size=32 \
- --learning_rate=2e-5 \
- --num_train_epochs=3 \
- --model_dir=${MODEL_DIR} \
- --distribution_strategy=tpu \
- --tpu=${TPU_NAME}
-```
-
-## Verify your results
-The training takes approximately 1 hour on a v3-8 TPU. When the script completes, you should see results similar to the following:
-```
-Training Summary:
-{'train_loss': 0.28142181038856506,
-'last_train_metrics': 0.9467429518699646,
-'eval_metrics': 0.8599063158035278,
-'total_training_steps': 36813}
-```
-
-## Clean up
-To avoid incurring charges to your GCP account for the resources used in this topic:
-1. Disconnect from the Compute Engine VM:
-```
-(vm)$ exit
-```
-2. In your Cloud Shell, run `ctpu delete` with the `--zone` flag you used when you set up the Cloud TPU to delete your Compute Engine VM and your Cloud TPU:
-```
-$ ctpu delete --zone=your-zone
-```
-3. Run `ctpu status`, specifying your zone, to make sure you have no instances allocated and to avoid unnecessary charges for TPU usage. The deletion might take several minutes. A response like the one below indicates there are no more allocated instances:
-```
-$ ctpu status --zone=your-zone
-```
-4. Run `gsutil` as shown, replacing `your-bucket` with the name of the Cloud Storage bucket you created for this tutorial:
-```
-$ gsutil rm -r gs://your-bucket
-```
-
-
 diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/bert_models.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/bert_models.py deleted file mode 100644 index 8bfc10ac9064ec42126454f02666aeb9c7a88da6..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/bert_models.py +++ /dev/null @@ -1,382 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# - -"""BERT models that are compatible with TF 2.0.""" - -import gin -import tensorflow as tf -import tensorflow_hub as hub - -from official.modeling import tf_utils -from official.nlp.albert import configs as albert_configs -from official.nlp.bert import configs -from official.nlp.modeling import models -from official.nlp.modeling import networks - - -class BertPretrainLossAndMetricLayer(tf.keras.layers.Layer): - """Returns layer that computes custom loss and metrics for pretraining.""" - - def __init__(self, vocab_size, **kwargs): - super(BertPretrainLossAndMetricLayer, self).__init__(**kwargs) - self._vocab_size = vocab_size - self.config = { - 'vocab_size': vocab_size, - } - - def _add_metrics(self, lm_output, lm_labels, lm_label_weights, - lm_example_loss, sentence_output, sentence_labels, - next_sentence_loss): - """Adds metrics.""" - masked_lm_accuracy = tf.keras.metrics.sparse_categorical_accuracy( - lm_labels, lm_output) - numerator = tf.reduce_sum(masked_lm_accuracy * lm_label_weights) - denominator = tf.reduce_sum(lm_label_weights) + 1e-5 - masked_lm_accuracy = numerator / denominator - self.add_metric( - masked_lm_accuracy, name='masked_lm_accuracy', aggregation='mean') - - self.add_metric(lm_example_loss, name='lm_example_loss', aggregation='mean') - - if sentence_labels is not None: - next_sentence_accuracy = tf.keras.metrics.sparse_categorical_accuracy( - sentence_labels, sentence_output) - self.add_metric( - next_sentence_accuracy, - name='next_sentence_accuracy', - aggregation='mean') - - if next_sentence_loss is not None: - self.add_metric( - next_sentence_loss, name='next_sentence_loss', aggregation='mean') - - def call(self, - lm_output_logits, - sentence_output_logits, - lm_label_ids, - lm_label_weights, - sentence_labels=None): - """Implements call() for the layer.""" - lm_label_weights = tf.cast(lm_label_weights, tf.float32) - lm_output_logits = tf.cast(lm_output_logits, tf.float32) - - lm_prediction_losses = tf.keras.losses.sparse_categorical_crossentropy( - lm_label_ids, lm_output_logits, from_logits=True) - lm_numerator_loss = tf.reduce_sum(lm_prediction_losses * lm_label_weights) - lm_denominator_loss = tf.reduce_sum(lm_label_weights) - mask_label_loss = tf.math.divide_no_nan(lm_numerator_loss, - lm_denominator_loss) - - if sentence_labels is not None: - sentence_output_logits = tf.cast(sentence_output_logits, tf.float32) - sentence_loss = tf.keras.losses.sparse_categorical_crossentropy( - sentence_labels, sentence_output_logits, from_logits=True) - sentence_loss = tf.reduce_mean(sentence_loss) - loss = mask_label_loss + sentence_loss - else: - sentence_loss = None - loss = mask_label_loss - - batch_shape = tf.slice(tf.shape(lm_label_ids), [0], [1]) - # TODO(hongkuny): Avoids the hack and switches add_loss. - final_loss = tf.fill(batch_shape, loss) - - self._add_metrics(lm_output_logits, lm_label_ids, lm_label_weights, - mask_label_loss, sentence_output_logits, sentence_labels, - sentence_loss) - return final_loss - - -@gin.configurable -def get_transformer_encoder(bert_config, - sequence_length=None, - transformer_encoder_cls=None, - output_range=None): - """Gets a 'TransformerEncoder' object. - - Args: - bert_config: A 'modeling.BertConfig' or 'modeling.AlbertConfig' object. - sequence_length: [Deprecated]. - transformer_encoder_cls: A EncoderScaffold class. If it is None, uses the - default BERT encoder implementation. - output_range: the sequence output range, [0, output_range). Default setting - is to return the entire sequence output. 
- - Returns: - A encoder object. - """ - del sequence_length - if transformer_encoder_cls is not None: - # TODO(hongkuny): evaluate if it is better to put cfg definition in gin. - embedding_cfg = dict( - vocab_size=bert_config.vocab_size, - type_vocab_size=bert_config.type_vocab_size, - hidden_size=bert_config.hidden_size, - max_seq_length=bert_config.max_position_embeddings, - initializer=tf.keras.initializers.TruncatedNormal( - stddev=bert_config.initializer_range), - dropout_rate=bert_config.hidden_dropout_prob, - ) - hidden_cfg = dict( - num_attention_heads=bert_config.num_attention_heads, - intermediate_size=bert_config.intermediate_size, - intermediate_activation=tf_utils.get_activation(bert_config.hidden_act), - dropout_rate=bert_config.hidden_dropout_prob, - attention_dropout_rate=bert_config.attention_probs_dropout_prob, - kernel_initializer=tf.keras.initializers.TruncatedNormal( - stddev=bert_config.initializer_range), - ) - kwargs = dict( - embedding_cfg=embedding_cfg, - hidden_cfg=hidden_cfg, - num_hidden_instances=bert_config.num_hidden_layers, - pooled_output_dim=bert_config.hidden_size, - pooler_layer_initializer=tf.keras.initializers.TruncatedNormal( - stddev=bert_config.initializer_range)) - - # Relies on gin configuration to define the Transformer encoder arguments. - return transformer_encoder_cls(**kwargs) - - kwargs = dict( - vocab_size=bert_config.vocab_size, - hidden_size=bert_config.hidden_size, - num_layers=bert_config.num_hidden_layers, - num_attention_heads=bert_config.num_attention_heads, - intermediate_size=bert_config.intermediate_size, - activation=tf_utils.get_activation(bert_config.hidden_act), - dropout_rate=bert_config.hidden_dropout_prob, - attention_dropout_rate=bert_config.attention_probs_dropout_prob, - max_sequence_length=bert_config.max_position_embeddings, - type_vocab_size=bert_config.type_vocab_size, - embedding_width=bert_config.embedding_size, - initializer=tf.keras.initializers.TruncatedNormal( - stddev=bert_config.initializer_range)) - if isinstance(bert_config, albert_configs.AlbertConfig): - return networks.AlbertEncoder(**kwargs) - else: - assert isinstance(bert_config, configs.BertConfig) - kwargs['output_range'] = output_range - return networks.BertEncoder(**kwargs) - - -def pretrain_model(bert_config, - seq_length, - max_predictions_per_seq, - initializer=None, - use_next_sentence_label=True, - return_core_pretrainer_model=False): - """Returns model to be used for pre-training. - - Args: - bert_config: Configuration that defines the core BERT model. - seq_length: Maximum sequence length of the training data. - max_predictions_per_seq: Maximum number of tokens in sequence to mask out - and use for pretraining. - initializer: Initializer for weights in BertPretrainer. - use_next_sentence_label: Whether to use the next sentence label. - return_core_pretrainer_model: Whether to also return the `BertPretrainer` - object. - - Returns: - A Tuple of (1) Pretraining model, (2) core BERT submodel from which to - save weights after pretraining, and (3) optional core `BertPretrainer` - object if argument `return_core_pretrainer_model` is True. 
- """ - input_word_ids = tf.keras.layers.Input( - shape=(seq_length,), name='input_word_ids', dtype=tf.int32) - input_mask = tf.keras.layers.Input( - shape=(seq_length,), name='input_mask', dtype=tf.int32) - input_type_ids = tf.keras.layers.Input( - shape=(seq_length,), name='input_type_ids', dtype=tf.int32) - masked_lm_positions = tf.keras.layers.Input( - shape=(max_predictions_per_seq,), - name='masked_lm_positions', - dtype=tf.int32) - masked_lm_ids = tf.keras.layers.Input( - shape=(max_predictions_per_seq,), name='masked_lm_ids', dtype=tf.int32) - masked_lm_weights = tf.keras.layers.Input( - shape=(max_predictions_per_seq,), - name='masked_lm_weights', - dtype=tf.int32) - - if use_next_sentence_label: - next_sentence_labels = tf.keras.layers.Input( - shape=(1,), name='next_sentence_labels', dtype=tf.int32) - else: - next_sentence_labels = None - - transformer_encoder = get_transformer_encoder(bert_config, seq_length) - if initializer is None: - initializer = tf.keras.initializers.TruncatedNormal( - stddev=bert_config.initializer_range) - pretrainer_model = models.BertPretrainer( - network=transformer_encoder, - embedding_table=transformer_encoder.get_embedding_table(), - num_classes=2, # The next sentence prediction label has two classes. - activation=tf_utils.get_activation(bert_config.hidden_act), - num_token_predictions=max_predictions_per_seq, - initializer=initializer, - output='logits') - - outputs = pretrainer_model( - [input_word_ids, input_mask, input_type_ids, masked_lm_positions]) - lm_output = outputs['masked_lm'] - sentence_output = outputs['classification'] - pretrain_loss_layer = BertPretrainLossAndMetricLayer( - vocab_size=bert_config.vocab_size) - output_loss = pretrain_loss_layer(lm_output, sentence_output, masked_lm_ids, - masked_lm_weights, next_sentence_labels) - inputs = { - 'input_word_ids': input_word_ids, - 'input_mask': input_mask, - 'input_type_ids': input_type_ids, - 'masked_lm_positions': masked_lm_positions, - 'masked_lm_ids': masked_lm_ids, - 'masked_lm_weights': masked_lm_weights, - } - if use_next_sentence_label: - inputs['next_sentence_labels'] = next_sentence_labels - - keras_model = tf.keras.Model(inputs=inputs, outputs=output_loss) - if return_core_pretrainer_model: - return keras_model, transformer_encoder, pretrainer_model - else: - return keras_model, transformer_encoder - - -def squad_model(bert_config, - max_seq_length, - initializer=None, - hub_module_url=None, - hub_module_trainable=True): - """Returns BERT Squad model along with core BERT model to import weights. - - Args: - bert_config: BertConfig, the config defines the core Bert model. - max_seq_length: integer, the maximum input sequence length. - initializer: Initializer for the final dense layer in the span labeler. - Defaulted to TruncatedNormal initializer. - hub_module_url: TF-Hub path/url to Bert module. - hub_module_trainable: True to finetune layers in the hub module. - - Returns: - A tuple of (1) keras model that outputs start logits and end logits and - (2) the core BERT transformer encoder. 
- """ - if initializer is None: - initializer = tf.keras.initializers.TruncatedNormal( - stddev=bert_config.initializer_range) - if not hub_module_url: - bert_encoder = get_transformer_encoder(bert_config, max_seq_length) - return models.BertSpanLabeler( - network=bert_encoder, initializer=initializer), bert_encoder - - input_word_ids = tf.keras.layers.Input( - shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids') - input_mask = tf.keras.layers.Input( - shape=(max_seq_length,), dtype=tf.int32, name='input_mask') - input_type_ids = tf.keras.layers.Input( - shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids') - core_model = hub.KerasLayer(hub_module_url, trainable=hub_module_trainable) - pooled_output, sequence_output = core_model( - [input_word_ids, input_mask, input_type_ids]) - bert_encoder = tf.keras.Model( - inputs={ - 'input_word_ids': input_word_ids, - 'input_mask': input_mask, - 'input_type_ids': input_type_ids, - }, - outputs=[sequence_output, pooled_output], - name='core_model') - return models.BertSpanLabeler( - network=bert_encoder, initializer=initializer), bert_encoder - - -def classifier_model(bert_config, - num_labels, - max_seq_length=None, - final_layer_initializer=None, - hub_module_url=None, - hub_module_trainable=True): - """BERT classifier model in functional API style. - - Construct a Keras model for predicting `num_labels` outputs from an input with - maximum sequence length `max_seq_length`. - - Args: - bert_config: BertConfig or AlbertConfig, the config defines the core BERT or - ALBERT model. - num_labels: integer, the number of classes. - max_seq_length: integer, the maximum input sequence length. - final_layer_initializer: Initializer for final dense layer. Defaulted - TruncatedNormal initializer. - hub_module_url: TF-Hub path/url to Bert module. - hub_module_trainable: True to finetune layers in the hub module. 
- - Returns: - Combined prediction model (words, mask, type) -> (one-hot labels) - BERT sub-model (words, mask, type) -> (bert_outputs) - """ - if final_layer_initializer is not None: - initializer = final_layer_initializer - else: - initializer = tf.keras.initializers.TruncatedNormal( - stddev=bert_config.initializer_range) - - if not hub_module_url: - bert_encoder = get_transformer_encoder( - bert_config, max_seq_length, output_range=1) - return models.BertClassifier( - bert_encoder, - num_classes=num_labels, - dropout_rate=bert_config.hidden_dropout_prob, - initializer=initializer), bert_encoder - - input_word_ids = tf.keras.layers.Input( - shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids') - input_mask = tf.keras.layers.Input( - shape=(max_seq_length,), dtype=tf.int32, name='input_mask') - input_type_ids = tf.keras.layers.Input( - shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids') - bert_model = hub.KerasLayer(hub_module_url, trainable=hub_module_trainable) - pooled_output, _ = bert_model([input_word_ids, input_mask, input_type_ids]) - output = tf.keras.layers.Dropout(rate=bert_config.hidden_dropout_prob)( - pooled_output) - - output = tf.keras.layers.Dense( - num_labels, kernel_initializer=initializer, name='output')( - output) - return tf.keras.Model( - inputs={ - 'input_word_ids': input_word_ids, - 'input_mask': input_mask, - 'input_type_ids': input_type_ids - }, - outputs=output), bert_model diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/bert_models_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/bert_models_test.py deleted file mode 100644 index 03ee8abd238682da110fb7d3625e1754d4c85248..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/bert_models_test.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import tensorflow as tf - -from official.nlp.bert import bert_models -from official.nlp.bert import configs as bert_configs -from official.nlp.modeling import networks - - -class BertModelsTest(tf.test.TestCase): - - def setUp(self): - super(BertModelsTest, self).setUp() - self._bert_test_config = bert_configs.BertConfig( - attention_probs_dropout_prob=0.0, - hidden_act='gelu', - hidden_dropout_prob=0.0, - hidden_size=16, - initializer_range=0.02, - intermediate_size=32, - max_position_embeddings=128, - num_attention_heads=2, - num_hidden_layers=2, - type_vocab_size=2, - vocab_size=30522) - - def test_pretrain_model(self): - model, encoder = bert_models.pretrain_model( - self._bert_test_config, - seq_length=5, - max_predictions_per_seq=2, - initializer=None, - use_next_sentence_label=True) - self.assertIsInstance(model, tf.keras.Model) - self.assertIsInstance(encoder, networks.BertEncoder) - - # model has one scalar output: loss value. - self.assertEqual(model.output.shape.as_list(), [ - None, - ]) - - # Expect two output from encoder: sequence and classification output. - self.assertIsInstance(encoder.output, list) - self.assertLen(encoder.output, 2) - # shape should be [batch size, hidden_size] - self.assertEqual(encoder.output[1].shape.as_list(), [None, 16]) - - def test_squad_model(self): - model, core_model = bert_models.squad_model( - self._bert_test_config, - max_seq_length=5, - initializer=None, - hub_module_url=None, - hub_module_trainable=None) - self.assertIsInstance(model, tf.keras.Model) - self.assertIsInstance(core_model, tf.keras.Model) - - # Expect two output from model: start positions and end positions - self.assertIsInstance(model.output, list) - self.assertLen(model.output, 2) - - # Expect two output from core_model: sequence and classification output. - self.assertIsInstance(core_model.output, list) - self.assertLen(core_model.output, 2) - # shape should be [batch size, None, hidden_size] - self.assertEqual(core_model.output[0].shape.as_list(), [None, None, 16]) - # shape should be [batch size, hidden_size] - self.assertEqual(core_model.output[1].shape.as_list(), [None, 16]) - - def test_classifier_model(self): - model, core_model = bert_models.classifier_model( - self._bert_test_config, - num_labels=3, - max_seq_length=5, - final_layer_initializer=None, - hub_module_url=None, - hub_module_trainable=None) - self.assertIsInstance(model, tf.keras.Model) - self.assertIsInstance(core_model, tf.keras.Model) - - # model has one classification output with num_labels=3. - self.assertEqual(model.output.shape.as_list(), [None, 3]) - - # Expect two output from core_model: sequence and classification output. - self.assertIsInstance(core_model.output, list) - self.assertLen(core_model.output, 2) - # shape should be [batch size, None, hidden_size] - self.assertEqual(core_model.output[0].shape.as_list(), [None, None, 16]) - # shape should be [batch size, hidden_size] - self.assertEqual(core_model.output[1].shape.as_list(), [None, 16]) - - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/common_flags.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/common_flags.py deleted file mode 100644 index c7242864721734f885384c52dbb0e4bb4bba8b97..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/common_flags.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Defining common flags used across all BERT models/applications.""" - -from absl import flags -import tensorflow as tf - -from official.utils import hyperparams_flags -from official.utils.flags import core as flags_core - - -def define_common_bert_flags(): - """Define common flags for BERT tasks.""" - flags_core.define_base( - data_dir=False, - model_dir=True, - clean=False, - train_epochs=False, - epochs_between_evals=False, - stop_threshold=False, - batch_size=False, - num_gpu=True, - export_dir=False, - distribution_strategy=True, - run_eagerly=True) - flags_core.define_distribution() - flags.DEFINE_string('bert_config_file', None, - 'Bert configuration file to define core bert layers.') - flags.DEFINE_string( - 'model_export_path', None, - 'Path to the directory, where trainined model will be ' - 'exported.') - flags.DEFINE_string('tpu', '', 'TPU address to connect to.') - flags.DEFINE_string( - 'init_checkpoint', None, - 'Initial checkpoint (usually from a pre-trained BERT model).') - flags.DEFINE_integer('num_train_epochs', 3, - 'Total number of training epochs to perform.') - flags.DEFINE_integer( - 'steps_per_loop', None, - 'Number of steps per graph-mode loop. Only training step ' - 'happens inside the loop. Callbacks will not be called ' - 'inside. If not set the value will be configured depending on the ' - 'devices available.') - flags.DEFINE_float('learning_rate', 5e-5, - 'The initial learning rate for Adam.') - flags.DEFINE_float('end_lr', 0.0, - 'The end learning rate for learning rate decay.') - flags.DEFINE_string('optimizer_type', 'adamw', - 'The type of optimizer to use for training (adamw|lamb)') - flags.DEFINE_boolean( - 'scale_loss', False, - 'Whether to divide the loss by number of replica inside the per-replica ' - 'loss function.') - flags.DEFINE_boolean( - 'use_keras_compile_fit', False, - 'If True, uses Keras compile/fit() API for training logic. Otherwise ' - 'use custom training loop.') - flags.DEFINE_string( - 'hub_module_url', None, 'TF-Hub path/url to Bert module. 
' - 'If specified, init_checkpoint flag should not be used.') - flags.DEFINE_bool('hub_module_trainable', True, - 'True to make keras layers in the hub module trainable.') - flags.DEFINE_string( - 'sub_model_export_name', None, - 'If set, `sub_model` checkpoints are exported into ' - 'FLAGS.model_dir/FLAGS.sub_model_export_name.') - flags.DEFINE_bool('explicit_allreduce', False, - 'True to use explicit allreduce instead of the implicit ' - 'allreduce in optimizer.apply_gradients(). If fp16 mixed ' - 'precision training is used, this also enables allreduce ' - 'gradients in fp16.') - flags.DEFINE_integer('allreduce_bytes_per_pack', 0, - 'Number of bytes of a gradient pack for allreduce. ' - 'Should be positive integer, if set to 0, all ' - 'gradients are in one pack. Breaking gradient into ' - 'packs could enable overlap between allreduce and ' - 'backprop computation. This flag only takes effect ' - 'when explicit_allreduce is set to True.') - - flags_core.define_log_steps() - - # Adds flags for mixed precision and multi-worker training. - flags_core.define_performance( - num_parallel_calls=False, - inter_op=False, - intra_op=False, - synthetic_data=False, - max_train_steps=False, - dtype=True, - loss_scale=True, - all_reduce_alg=True, - num_packs=False, - tf_gpu_thread_mode=True, - datasets_num_private_threads=True, - enable_xla=True, - fp16_implementation=True, - ) - - # Adds gin configuration flags. - hyperparams_flags.define_gin_flags() - - -def dtype(): - return flags_core.get_tf_dtype(flags.FLAGS) - - -def use_float16(): - return flags_core.get_tf_dtype(flags.FLAGS) == tf.float16 - - -def use_graph_rewrite(): - return flags.FLAGS.fp16_implementation == 'graph_rewrite' - - -def get_loss_scale(): - return flags_core.get_loss_scale(flags.FLAGS, default_for_fp16='dynamic') diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/configs.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/configs.py deleted file mode 100644 index 1cb128d15e1858f85819a7621308d33bf781fb07..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/configs.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""The main BERT model and related functions.""" - -import copy -import json - -import six -import tensorflow as tf - - -class BertConfig(object): - """Configuration for `BertModel`.""" - - def __init__(self, - vocab_size, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - initializer_range=0.02, - embedding_size=None, - backward_compatible=True): - """Constructs BertConfig. - - Args: - vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. - hidden_size: Size of the encoder layers and the pooler layer. - num_hidden_layers: Number of hidden layers in the Transformer encoder. - num_attention_heads: Number of attention heads for each attention layer in - the Transformer encoder. - intermediate_size: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - hidden_act: The non-linear activation function (function or string) in the - encoder and pooler. - hidden_dropout_prob: The dropout probability for all fully connected - layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob: The dropout ratio for the attention - probabilities. - max_position_embeddings: The maximum sequence length that this model might - ever be used with. Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). - type_vocab_size: The vocabulary size of the `token_type_ids` passed into - `BertModel`. - initializer_range: The stdev of the truncated_normal_initializer for - initializing all weight matrices. - embedding_size: (Optional) width of the factorized word embeddings. - backward_compatible: Boolean, whether the variables shape are compatible - with checkpoints converted from TF 1.x BERT. 
- """ - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.embedding_size = embedding_size - self.backward_compatible = backward_compatible - - @classmethod - def from_dict(cls, json_object): - """Constructs a `BertConfig` from a Python dictionary of parameters.""" - config = BertConfig(vocab_size=None) - for (key, value) in six.iteritems(json_object): - config.__dict__[key] = value - return config - - @classmethod - def from_json_file(cls, json_file): - """Constructs a `BertConfig` from a json file of parameters.""" - with tf.io.gfile.GFile(json_file, "r") as reader: - text = reader.read() - return cls.from_dict(json.loads(text)) - - def to_dict(self): - """Serializes this instance to a Python dictionary.""" - output = copy.deepcopy(self.__dict__) - return output - - def to_json_string(self): - """Serializes this instance to a JSON string.""" - return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/export_tfhub.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/export_tfhub.py deleted file mode 100644 index 2d4fda1329e0b6a77713bbd94385f235bdbd47eb..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/export_tfhub.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""A script to export BERT as a TF-Hub SavedModel. - -This script is **DEPRECATED** for exporting BERT encoder models; -see the error message in by main() for details. 
-""" - -from typing import Text - -# Import libraries -from absl import app -from absl import flags -from absl import logging -import tensorflow as tf -from official.nlp.bert import bert_models -from official.nlp.bert import configs - -FLAGS = flags.FLAGS - -flags.DEFINE_string("bert_config_file", None, - "Bert configuration file to define core bert layers.") -flags.DEFINE_string("model_checkpoint_path", None, - "File path to TF model checkpoint.") -flags.DEFINE_string("export_path", None, "TF-Hub SavedModel destination path.") -flags.DEFINE_string("vocab_file", None, - "The vocabulary file that the BERT model was trained on.") -flags.DEFINE_bool( - "do_lower_case", None, "Whether to lowercase. If None, " - "do_lower_case will be enabled if 'uncased' appears in the " - "name of --vocab_file") -flags.DEFINE_enum("model_type", "encoder", ["encoder", "squad"], - "What kind of BERT model to export.") - - -def create_bert_model(bert_config: configs.BertConfig) -> tf.keras.Model: - """Creates a BERT keras core model from BERT configuration. - - Args: - bert_config: A `BertConfig` to create the core model. - - Returns: - A keras model. - """ - # Adds input layers just as placeholders. - input_word_ids = tf.keras.layers.Input( - shape=(None,), dtype=tf.int32, name="input_word_ids") - input_mask = tf.keras.layers.Input( - shape=(None,), dtype=tf.int32, name="input_mask") - input_type_ids = tf.keras.layers.Input( - shape=(None,), dtype=tf.int32, name="input_type_ids") - transformer_encoder = bert_models.get_transformer_encoder( - bert_config, sequence_length=None) - sequence_output, pooled_output = transformer_encoder( - [input_word_ids, input_mask, input_type_ids]) - # To keep consistent with legacy hub modules, the outputs are - # "pooled_output" and "sequence_output". - return tf.keras.Model( - inputs=[input_word_ids, input_mask, input_type_ids], - outputs=[pooled_output, sequence_output]), transformer_encoder - - -def export_bert_tfhub(bert_config: configs.BertConfig, - model_checkpoint_path: Text, - hub_destination: Text, - vocab_file: Text, - do_lower_case: bool = None): - """Restores a tf.keras.Model and saves for TF-Hub.""" - # If do_lower_case is not explicit, default to checking whether "uncased" is - # in the vocab file name - if do_lower_case is None: - do_lower_case = "uncased" in vocab_file - logging.info("Using do_lower_case=%s based on name of vocab_file=%s", - do_lower_case, vocab_file) - core_model, encoder = create_bert_model(bert_config) - checkpoint = tf.train.Checkpoint( - model=encoder, # Legacy checkpoints. 
- encoder=encoder) - checkpoint.restore(model_checkpoint_path).assert_existing_objects_matched() - core_model.vocab_file = tf.saved_model.Asset(vocab_file) - core_model.do_lower_case = tf.Variable(do_lower_case, trainable=False) - core_model.save(hub_destination, include_optimizer=False, save_format="tf") - - -def export_bert_squad_tfhub(bert_config: configs.BertConfig, - model_checkpoint_path: Text, - hub_destination: Text, - vocab_file: Text, - do_lower_case: bool = None): - """Restores a tf.keras.Model for BERT with SQuAD and saves for TF-Hub.""" - # If do_lower_case is not explicit, default to checking whether "uncased" is - # in the vocab file name - if do_lower_case is None: - do_lower_case = "uncased" in vocab_file - logging.info("Using do_lower_case=%s based on name of vocab_file=%s", - do_lower_case, vocab_file) - span_labeling, _ = bert_models.squad_model(bert_config, max_seq_length=None) - checkpoint = tf.train.Checkpoint(model=span_labeling) - checkpoint.restore(model_checkpoint_path).assert_existing_objects_matched() - span_labeling.vocab_file = tf.saved_model.Asset(vocab_file) - span_labeling.do_lower_case = tf.Variable(do_lower_case, trainable=False) - span_labeling.save(hub_destination, include_optimizer=False, save_format="tf") - - -def main(_): - bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file) - if FLAGS.model_type == "encoder": - deprecation_note = ( - "nlp/bert/export_tfhub is **DEPRECATED** for exporting BERT encoder " - "models. Please switch to nlp/tools/export_tfhub for exporting BERT " - "(and other) encoders with dict inputs/outputs conforming to " - "https://www.tensorflow.org/hub/common_saved_model_apis/text#transformer-encoders" - ) - logging.error(deprecation_note) - print("\n\nNOTICE:", deprecation_note, "\n") - export_bert_tfhub(bert_config, FLAGS.model_checkpoint_path, - FLAGS.export_path, FLAGS.vocab_file, FLAGS.do_lower_case) - elif FLAGS.model_type == "squad": - export_bert_squad_tfhub(bert_config, FLAGS.model_checkpoint_path, - FLAGS.export_path, FLAGS.vocab_file, - FLAGS.do_lower_case) - else: - raise ValueError("Unsupported model_type %s." % FLAGS.model_type) - - -if __name__ == "__main__": - app.run(main) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/export_tfhub_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/export_tfhub_test.py deleted file mode 100644 index 9a5c1f9ad36d1f756488544a237b0a5e16ae6a07..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/export_tfhub_test.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Tests official.nlp.bert.export_tfhub.""" - -import os - -from absl.testing import parameterized -import numpy as np -import tensorflow as tf -import tensorflow_hub as hub - -from official.nlp.bert import configs -from official.nlp.bert import export_tfhub - - -class ExportTfhubTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.parameters("model", "encoder") - def test_export_tfhub(self, ckpt_key_name): - # Exports a savedmodel for TF-Hub - hidden_size = 16 - bert_config = configs.BertConfig( - vocab_size=100, - hidden_size=hidden_size, - intermediate_size=32, - max_position_embeddings=128, - num_attention_heads=2, - num_hidden_layers=1) - bert_model, encoder = export_tfhub.create_bert_model(bert_config) - model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint") - checkpoint = tf.train.Checkpoint(**{ckpt_key_name: encoder}) - checkpoint.save(os.path.join(model_checkpoint_dir, "test")) - model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir) - - vocab_file = os.path.join(self.get_temp_dir(), "uncased_vocab.txt") - with tf.io.gfile.GFile(vocab_file, "w") as f: - f.write("dummy content") - - hub_destination = os.path.join(self.get_temp_dir(), "hub") - export_tfhub.export_bert_tfhub(bert_config, model_checkpoint_path, - hub_destination, vocab_file) - - # Restores a hub KerasLayer. - hub_layer = hub.KerasLayer(hub_destination, trainable=True) - - if hasattr(hub_layer, "resolved_object"): - # Checks meta attributes. - self.assertTrue(hub_layer.resolved_object.do_lower_case.numpy()) - with tf.io.gfile.GFile( - hub_layer.resolved_object.vocab_file.asset_path.numpy()) as f: - self.assertEqual("dummy content", f.read()) - # Checks the hub KerasLayer. - for source_weight, hub_weight in zip(bert_model.trainable_weights, - hub_layer.trainable_weights): - self.assertAllClose(source_weight.numpy(), hub_weight.numpy()) - - seq_length = 10 - dummy_ids = np.zeros((2, seq_length), dtype=np.int32) - hub_outputs = hub_layer([dummy_ids, dummy_ids, dummy_ids]) - source_outputs = bert_model([dummy_ids, dummy_ids, dummy_ids]) - - # The outputs of hub module are "pooled_output" and "sequence_output", - # while the outputs of encoder is in reversed order, i.e., - # "sequence_output" and "pooled_output". - encoder_outputs = reversed(encoder([dummy_ids, dummy_ids, dummy_ids])) - self.assertEqual(hub_outputs[0].shape, (2, hidden_size)) - self.assertEqual(hub_outputs[1].shape, (2, seq_length, hidden_size)) - for source_output, hub_output, encoder_output in zip( - source_outputs, hub_outputs, encoder_outputs): - self.assertAllClose(source_output.numpy(), hub_output.numpy()) - self.assertAllClose(source_output.numpy(), encoder_output.numpy()) - - # Test that training=True makes a difference (activates dropout). 
- def _dropout_mean_stddev(training, num_runs=20): - input_ids = np.array([[14, 12, 42, 95, 99]], np.int32) - inputs = [input_ids, np.ones_like(input_ids), np.zeros_like(input_ids)] - outputs = np.concatenate( - [hub_layer(inputs, training=training)[0] for _ in range(num_runs)]) - return np.mean(np.std(outputs, axis=0)) - - self.assertLess(_dropout_mean_stddev(training=False), 1e-6) - self.assertGreater(_dropout_mean_stddev(training=True), 1e-3) - - # Test propagation of seq_length in shape inference. - input_word_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32) - input_mask = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32) - input_type_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32) - pooled_output, sequence_output = hub_layer( - [input_word_ids, input_mask, input_type_ids]) - self.assertEqual(pooled_output.shape.as_list(), [None, hidden_size]) - self.assertEqual(sequence_output.shape.as_list(), - [None, seq_length, hidden_size]) - - -if __name__ == "__main__": - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/input_pipeline.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/input_pipeline.py deleted file mode 100644 index 6e0d05afa42957fb8a2a35fa77cf77426d24fad8..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/input_pipeline.py +++ /dev/null @@ -1,318 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""BERT model input pipelines.""" - -import tensorflow as tf - - -def decode_record(record, name_to_features): - """Decodes a record to a TensorFlow example.""" - example = tf.io.parse_single_example(record, name_to_features) - - # tf.Example only supports tf.int64, but the TPU only supports tf.int32. - # So cast all int64 to int32. - for name in list(example.keys()): - t = example[name] - if t.dtype == tf.int64: - t = tf.cast(t, tf.int32) - example[name] = t - - return example - - -def single_file_dataset(input_file, name_to_features, num_samples=None): - """Creates a single-file dataset to be passed for BERT custom training.""" - # For training, we want a lot of parallel reading and shuffling. 
- # For eval, we want no shuffling and parallel reading doesn't matter. - d = tf.data.TFRecordDataset(input_file) - if num_samples: - d = d.take(num_samples) - d = d.map( - lambda record: decode_record(record, name_to_features), - num_parallel_calls=tf.data.experimental.AUTOTUNE) - - # When `input_file` is a path to a single file or a list - # containing a single path, disable auto sharding so that - # same input file is sent to all workers. - if isinstance(input_file, str) or len(input_file) == 1: - options = tf.data.Options() - options.experimental_distribute.auto_shard_policy = ( - tf.data.experimental.AutoShardPolicy.OFF) - d = d.with_options(options) - return d - - -def create_pretrain_dataset(input_patterns, - seq_length, - max_predictions_per_seq, - batch_size, - is_training=True, - input_pipeline_context=None, - use_next_sentence_label=True, - use_position_id=False, - output_fake_labels=True): - """Creates input dataset from (tf)records files for pretraining.""" - name_to_features = { - 'input_ids': - tf.io.FixedLenFeature([seq_length], tf.int64), - 'input_mask': - tf.io.FixedLenFeature([seq_length], tf.int64), - 'segment_ids': - tf.io.FixedLenFeature([seq_length], tf.int64), - 'masked_lm_positions': - tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64), - 'masked_lm_ids': - tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64), - 'masked_lm_weights': - tf.io.FixedLenFeature([max_predictions_per_seq], tf.float32), - } - if use_next_sentence_label: - name_to_features['next_sentence_labels'] = tf.io.FixedLenFeature([1], - tf.int64) - if use_position_id: - name_to_features['position_ids'] = tf.io.FixedLenFeature([seq_length], - tf.int64) - for input_pattern in input_patterns: - if not tf.io.gfile.glob(input_pattern): - raise ValueError('%s does not match any files.' % input_pattern) - - dataset = tf.data.Dataset.list_files(input_patterns, shuffle=is_training) - - if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1: - dataset = dataset.shard(input_pipeline_context.num_input_pipelines, - input_pipeline_context.input_pipeline_id) - if is_training: - dataset = dataset.repeat() - - # We set shuffle buffer to exactly match total number of - # training files to ensure that training data is well shuffled. - input_files = [] - for input_pattern in input_patterns: - input_files.extend(tf.io.gfile.glob(input_pattern)) - dataset = dataset.shuffle(len(input_files)) - - # In parallel, create tf record dataset for each train files. - # cycle_length = 8 means that up to 8 files will be read and deserialized in - # parallel. You may want to increase this number if you have a large number of - # CPU cores. 
- dataset = dataset.interleave( - tf.data.TFRecordDataset, - cycle_length=8, - num_parallel_calls=tf.data.experimental.AUTOTUNE) - - if is_training: - dataset = dataset.shuffle(100) - - decode_fn = lambda record: decode_record(record, name_to_features) - dataset = dataset.map( - decode_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE) - - def _select_data_from_record(record): - """Filter out features to use for pretraining.""" - x = { - 'input_word_ids': record['input_ids'], - 'input_mask': record['input_mask'], - 'input_type_ids': record['segment_ids'], - 'masked_lm_positions': record['masked_lm_positions'], - 'masked_lm_ids': record['masked_lm_ids'], - 'masked_lm_weights': record['masked_lm_weights'], - } - if use_next_sentence_label: - x['next_sentence_labels'] = record['next_sentence_labels'] - if use_position_id: - x['position_ids'] = record['position_ids'] - - # TODO(hongkuny): Remove the fake labels after migrating bert pretraining. - if output_fake_labels: - return (x, record['masked_lm_weights']) - else: - return x - - dataset = dataset.map( - _select_data_from_record, - num_parallel_calls=tf.data.experimental.AUTOTUNE) - dataset = dataset.batch(batch_size, drop_remainder=is_training) - dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) - return dataset - - -def create_classifier_dataset(file_path, - seq_length, - batch_size, - is_training=True, - input_pipeline_context=None, - label_type=tf.int64, - include_sample_weights=False, - num_samples=None): - """Creates input dataset from (tf)records files for train/eval.""" - name_to_features = { - 'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64), - 'input_mask': tf.io.FixedLenFeature([seq_length], tf.int64), - 'segment_ids': tf.io.FixedLenFeature([seq_length], tf.int64), - 'label_ids': tf.io.FixedLenFeature([], label_type), - } - if include_sample_weights: - name_to_features['weight'] = tf.io.FixedLenFeature([], tf.float32) - dataset = single_file_dataset(file_path, name_to_features, - num_samples=num_samples) - - # The dataset is always sharded by number of hosts. - # num_input_pipelines is the number of hosts rather than number of cores. 
- if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1: - dataset = dataset.shard(input_pipeline_context.num_input_pipelines, - input_pipeline_context.input_pipeline_id) - - def _select_data_from_record(record): - x = { - 'input_word_ids': record['input_ids'], - 'input_mask': record['input_mask'], - 'input_type_ids': record['segment_ids'] - } - y = record['label_ids'] - if include_sample_weights: - w = record['weight'] - return (x, y, w) - return (x, y) - - if is_training: - dataset = dataset.shuffle(100) - dataset = dataset.repeat() - - dataset = dataset.map( - _select_data_from_record, - num_parallel_calls=tf.data.experimental.AUTOTUNE) - dataset = dataset.batch(batch_size, drop_remainder=is_training) - dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) - return dataset - - -def create_squad_dataset(file_path, - seq_length, - batch_size, - is_training=True, - input_pipeline_context=None): - """Creates input dataset from (tf)records files for train/eval.""" - name_to_features = { - 'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64), - 'input_mask': tf.io.FixedLenFeature([seq_length], tf.int64), - 'segment_ids': tf.io.FixedLenFeature([seq_length], tf.int64), - } - if is_training: - name_to_features['start_positions'] = tf.io.FixedLenFeature([], tf.int64) - name_to_features['end_positions'] = tf.io.FixedLenFeature([], tf.int64) - else: - name_to_features['unique_ids'] = tf.io.FixedLenFeature([], tf.int64) - - dataset = single_file_dataset(file_path, name_to_features) - - # The dataset is always sharded by number of hosts. - # num_input_pipelines is the number of hosts rather than number of cores. - if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1: - dataset = dataset.shard(input_pipeline_context.num_input_pipelines, - input_pipeline_context.input_pipeline_id) - - def _select_data_from_record(record): - """Dispatches record to features and labels.""" - x, y = {}, {} - for name, tensor in record.items(): - if name in ('start_positions', 'end_positions'): - y[name] = tensor - elif name == 'input_ids': - x['input_word_ids'] = tensor - elif name == 'segment_ids': - x['input_type_ids'] = tensor - else: - x[name] = tensor - return (x, y) - - if is_training: - dataset = dataset.shuffle(100) - dataset = dataset.repeat() - - dataset = dataset.map( - _select_data_from_record, - num_parallel_calls=tf.data.experimental.AUTOTUNE) - dataset = dataset.batch(batch_size, drop_remainder=True) - dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) - return dataset - - -def create_retrieval_dataset(file_path, - seq_length, - batch_size, - input_pipeline_context=None): - """Creates input dataset from (tf)records files for scoring.""" - name_to_features = { - 'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64), - 'input_mask': tf.io.FixedLenFeature([seq_length], tf.int64), - 'segment_ids': tf.io.FixedLenFeature([seq_length], tf.int64), - 'example_id': tf.io.FixedLenFeature([1], tf.int64), - } - dataset = single_file_dataset(file_path, name_to_features) - - # The dataset is always sharded by number of hosts. - # num_input_pipelines is the number of hosts rather than number of cores. 
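decode_record, called by all of these dataset builders, is defined earlier in this file. A minimal stand-in, assuming it simply parses one serialized tf.train.Example against the name_to_features spec and casts int64 features down to int32 (which is what the int32 padding in _pad_to_batch below suggests):

    import tensorflow as tf

    def decode_record_sketch(record, name_to_features):
      """Parses a serialized Example and downcasts int64 features to int32."""
      example = tf.io.parse_single_example(record, name_to_features)
      for name in list(example.keys()):
        tensor = example[name]
        if tensor.dtype == tf.int64:
          tensor = tf.cast(tensor, tf.int32)
        example[name] = tensor
      return example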
- if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1: - dataset = dataset.shard(input_pipeline_context.num_input_pipelines, - input_pipeline_context.input_pipeline_id) - - def _select_data_from_record(record): - x = { - 'input_word_ids': record['input_ids'], - 'input_mask': record['input_mask'], - 'input_type_ids': record['segment_ids'] - } - y = record['example_id'] - return (x, y) - - dataset = dataset.map( - _select_data_from_record, - num_parallel_calls=tf.data.experimental.AUTOTUNE) - dataset = dataset.batch(batch_size, drop_remainder=False) - - def _pad_to_batch(x, y): - cur_size = tf.shape(y)[0] - pad_size = batch_size - cur_size - - pad_ids = tf.zeros(shape=[pad_size, seq_length], dtype=tf.int32) - for key in ('input_word_ids', 'input_mask', 'input_type_ids'): - x[key] = tf.concat([x[key], pad_ids], axis=0) - - pad_labels = -tf.ones(shape=[pad_size, 1], dtype=tf.int32) - y = tf.concat([y, pad_labels], axis=0) - return x, y - - dataset = dataset.map( - _pad_to_batch, - num_parallel_calls=tf.data.experimental.AUTOTUNE) - - dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) - return dataset diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/model_saving_utils.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/model_saving_utils.py deleted file mode 100644 index 141ebb2c4e347c89b5dacc4ed6fb105a3e3b3017..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/model_saving_utils.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Utilities to save models.""" - -import os - -from absl import logging -import tensorflow as tf -import typing - - -def export_bert_model(model_export_path: typing.Text, - model: tf.keras.Model, - checkpoint_dir: typing.Optional[typing.Text] = None, - restore_model_using_load_weights: bool = False) -> None: - """Export BERT model for serving which does not include the optimizer. - - Args: - model_export_path: Path to which exported model will be saved. - model: Keras model object to export. - checkpoint_dir: Path from which model weights will be loaded, if - specified. 
-    restore_model_using_load_weights: Whether to use checkpoint.restore() API
-      for custom checkpoint or to use model.load_weights() API. There are two
-      different ways to save checkpoints. One is using tf.train.Checkpoint and
-      another is using Keras model.save_weights(). The custom training loop
-      implementation uses the tf.train.Checkpoint API and the Keras
-      ModelCheckpoint callback internally uses the model.save_weights() API.
-      Since these two APIs cannot be used together, the model loading logic
-      must take into account how the model checkpoint was saved.
-
-  Raises:
-    ValueError when either model_export_path or model is not specified.
-  """
-  if not model_export_path:
-    raise ValueError('model_export_path must be specified.')
-  if not isinstance(model, tf.keras.Model):
-    raise ValueError('model must be a tf.keras.Model object.')
-
-  if checkpoint_dir:
-    if restore_model_using_load_weights:
-      model_weight_path = os.path.join(checkpoint_dir, 'checkpoint')
-      assert tf.io.gfile.exists(model_weight_path)
-      model.load_weights(model_weight_path)
-    else:
-      checkpoint = tf.train.Checkpoint(model=model)
-
-      # Restores the model from the latest checkpoint.
-      latest_checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
-      assert latest_checkpoint_file
-      logging.info('Checkpoint file %s found and restoring from '
-                   'checkpoint', latest_checkpoint_file)
-      checkpoint.restore(
-          latest_checkpoint_file).assert_existing_objects_matched()
-
-  model.save(model_export_path, include_optimizer=False, save_format='tf')
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/model_training_utils.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/model_training_utils.py
deleted file mode 100644
index 189021ecb5e59446bb69bca6c5e17cbf05c4e47c..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/model_training_utils.py
+++ /dev/null
@@ -1,607 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""Lightweight utilities to train NLP models."""
-
-import json
-import os
-import tempfile
-from absl import logging
-import tensorflow as tf
-from tensorflow.python.util import deprecation
-from official.common import distribute_utils
-from official.staging.training import grad_utils
-
-_SUMMARY_TXT = 'training_summary.txt'
-_MIN_SUMMARY_STEPS = 10
-
-
-def _should_export_checkpoint(strategy):
-  return (not strategy) or strategy.extended.should_checkpoint
-
-
-def _should_export_summary(strategy):
-  return (not strategy) or strategy.extended.should_save_summary
-
-
-def _save_checkpoint(strategy, checkpoint, model_dir, checkpoint_prefix):
-  """Saves the model with the provided checkpoint prefix."""
-
-  if _should_export_checkpoint(strategy):
-    checkpoint_path = os.path.join(model_dir, checkpoint_prefix)
-    saved_path = checkpoint.save(checkpoint_path)
-    logging.info('Saving model as TF checkpoint: %s', saved_path)
-  else:
-    # In multi worker training we need every worker to save checkpoint, because
-    # variables can trigger synchronization on read and synchronization needs
-    # all workers to participate. To avoid workers overwriting each other we
-    # save to a temporary directory on non-chief workers.
-    tmp_dir = tempfile.mkdtemp()
-    checkpoint.save(os.path.join(tmp_dir, 'ckpt'))
-    tf.io.gfile.rmtree(tmp_dir)
-  return
-
-
-def _get_input_iterator(input_fn, strategy):
-  """Returns distributed dataset iterator."""
-  # When training with TPU pods, datasets need to be cloned across
-  # workers. Since a Dataset instance cannot be cloned in eager mode, we
-  # instead pass a callable that returns a dataset.
-  if not callable(input_fn):
-    raise ValueError('`input_fn` should be a closure that returns a dataset.')
-  iterator = iter(strategy.distribute_datasets_from_function(input_fn))
-  return iterator
-
-
-def _float_metric_value(metric):
-  """Gets the value of a float-valued Keras metric."""
-  return metric.result().numpy().astype(float)
-
-
-def clip_by_global_norm_callback(grads_and_vars):
-  """Performs gradient clipping."""
-  grads, variables = zip(*grads_and_vars)
-  (clipped_grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
-  return zip(clipped_grads, variables)
-
-
-def steps_to_run(current_step, steps_per_epoch, steps_per_loop):
-  """Calculates steps to run on device."""
-  if steps_per_loop <= 0:
-    raise ValueError('steps_per_loop should be a positive integer.')
-  if steps_per_loop == 1:
-    return steps_per_loop
-  remainder_in_epoch = current_step % steps_per_epoch
-  if remainder_in_epoch != 0:
-    return min(steps_per_epoch - remainder_in_epoch, steps_per_loop)
-  else:
-    return steps_per_loop
-
-
-def write_txt_summary(training_summary, summary_dir):
-  """Writes a summary text file to record stats."""
-  if not tf.io.gfile.exists(summary_dir):
-    tf.io.gfile.mkdir(summary_dir)
-  summary_path = os.path.join(summary_dir, _SUMMARY_TXT)
-  with tf.io.gfile.GFile(summary_path, 'wb') as f:
-    logging.info('Training Summary: \n%s', str(training_summary))
-    f.write(json.dumps(training_summary, indent=4))
-
-
-@deprecation.deprecated(
-    None, 'This function is deprecated and we do not expect to add new '
-    'functionality. 
Please do not make your code depend '
-    'on this library.')
-def run_customized_training_loop(
-    # pylint: disable=invalid-name
-    _sentinel=None,
-    # pylint: enable=invalid-name
-    strategy=None,
-    model_fn=None,
-    loss_fn=None,
-    scale_loss=True,
-    model_dir=None,
-    train_input_fn=None,
-    steps_per_epoch=None,
-    num_eval_per_epoch=1,
-    steps_per_loop=None,
-    epochs=1,
-    eval_input_fn=None,
-    eval_steps=None,
-    metric_fn=None,
-    init_checkpoint=None,
-    custom_callbacks=None,
-    run_eagerly=False,
-    sub_model_export_name=None,
-    explicit_allreduce=False,
-    pre_allreduce_callbacks=None,
-    post_allreduce_callbacks=None,
-    train_summary_interval=0,
-    allreduce_bytes_per_pack=0):
-  """Run BERT pretrain model training using low-level API.
-
-  Args:
-    _sentinel: Used to prevent positional parameters. Internal, do not use.
-    strategy: Distribution strategy on which to run low level training loop.
-    model_fn: Function that returns a tuple (model, sub_model). Caller of this
-      function should add an optimizer to the `model` via calling the
-      `model.compile()` API or manually setting the `model.optimizer`
-      attribute. The second element of the returned tuple (sub_model) is an
-      optional sub model to be used for the initial checkpoint -- if provided.
-    loss_fn: Function with signature func(labels, logits) that returns a loss
-      tensor.
-    scale_loss: Whether to divide the raw loss by the number of replicas
-      before gradients calculation.
-    model_dir: Model directory used during training for restoring/saving model
-      weights.
-    train_input_fn: Function that returns a tf.data.Dataset used for training.
-    steps_per_epoch: Number of steps to run per epoch. At the end of each
-      epoch, a model checkpoint will be saved and evaluation will be conducted
-      if an evaluation dataset is provided.
-    num_eval_per_epoch: Number of evaluations per epoch.
-    steps_per_loop: Number of steps per graph-mode loop. In order to reduce
-      communication in eager context, training logs are printed every
-      steps_per_loop.
-    epochs: Number of epochs to train.
-    eval_input_fn: Function that returns the evaluation dataset. If None,
-      evaluation is skipped.
-    eval_steps: Number of steps to run evaluation. Required if `eval_input_fn`
-      is not None.
-    metric_fn: A metrics function that returns either a Keras Metric object or
-      a list of Keras Metric objects to record evaluation results using the
-      evaluation dataset or with the training dataset after every epoch.
-    init_checkpoint: Optional checkpoint to load to `sub_model` returned by
-      `model_fn`.
-    custom_callbacks: A list of Keras Callbacks objects to run during
-      training. More specifically, the `on_train_begin()`, `on_train_end()`,
-      `on_batch_begin()`, `on_batch_end()`, `on_epoch_begin()`, and
-      `on_epoch_end()` methods are invoked during training. Note that some
-      metrics may be missing from `logs`.
-    run_eagerly: Whether to run model training in pure eager execution. This
-      should be disabled for TPUStrategy.
-    sub_model_export_name: If not None, will export `sub_model` returned by
-      `model_fn` into checkpoint files. The name of an intermediate checkpoint
-      file is {sub_model_export_name}_step_{step}.ckpt and the last
-      checkpoint's name is {sub_model_export_name}.ckpt; if None, `sub_model`
-      will not be exported as a checkpoint.
-    explicit_allreduce: Whether to explicitly perform gradient allreduce,
-      instead of relying on implicit allreduce in optimizer.apply_gradients().
-      Default is False. For now, if training using FP16 mixed precision,
-      explicit allreduce will aggregate gradients in FP16 format. 
For TPU and
-      GPU training using FP32, explicit allreduce will aggregate gradients in
-      FP32 format.
-    pre_allreduce_callbacks: A list of callback functions that take gradient
-      and model variable pairs as input, manipulate them, and return new
-      gradient and model variable pairs. The callback functions will be
-      invoked in the list order and before gradients are allreduced. With
-      mixed precision training, the pre_allreduce_callbacks will be applied to
-      scaled_gradients. Default is no callbacks. Only used when
-      explicit_allreduce=True.
-    post_allreduce_callbacks: A list of callback functions that take gradient
-      and model variable pairs as input, manipulate them, and return new
-      gradient and model variable pairs. The callback functions will be
-      invoked in the list order and right before gradients are applied to
-      variables for updates. Default is no callbacks. Only used when
-      explicit_allreduce=True.
-    train_summary_interval: Step interval for training summaries. If the value
-      is a negative number, then training summaries are not enabled.
-    allreduce_bytes_per_pack: A non-negative integer. Breaks collective
-      operations into packs of a certain size. If it's zero, all gradients are
-      in one pack. Breaking gradients into packs could enable overlap between
-      allreduce and backprop computation. This flag only takes effect when
-      explicit_allreduce is set to True.
-
-  Returns:
-    Trained model.
-
-  Raises:
-    ValueError: (1) When the model returned by `model_fn` does not have an
-      optimizer attribute or when required parameters are set to None. (2)
-      eval args are not specified correctly. (3) metric_fn must be a callable
-      if specified. (4) sub_model_export_name is specified, but `sub_model`
-      returned by `model_fn` is None.
-  """
-
-  if _sentinel is not None:
-    raise ValueError('only call `run_customized_training_loop()` '
-                     'with named arguments.')
-
-  required_arguments = [
-      strategy, model_fn, loss_fn, model_dir, steps_per_epoch, train_input_fn
-  ]
-
-  steps_between_evals = int(steps_per_epoch / num_eval_per_epoch)
-  if [arg for arg in required_arguments if arg is None]:
-    raise ValueError('`strategy`, `model_fn`, `loss_fn`, `model_dir`, '
-                     '`steps_per_epoch` and `train_input_fn` are required '
-                     'parameters.')
-  if not steps_per_loop:
-    if tf.config.list_logical_devices('TPU'):
-      # One can't fully utilize a TPU with steps_per_loop=1, so in this case
-      # we default to a more useful value.
-      steps_per_loop = min(1000, steps_between_evals)
-    else:
-      steps_per_loop = 1
-    logging.info('steps_per_loop not specified. 
Using steps_per_loop=%d',
-                 steps_per_loop)
-  if steps_per_loop > steps_between_evals:
-    logging.warning(
-        'steps_per_loop: %d is specified to be greater than '
-        'steps_between_evals: %d, we will use steps_between_evals as '
-        'steps_per_loop.', steps_per_loop, steps_between_evals)
-    steps_per_loop = steps_between_evals
-  assert tf.executing_eagerly()
-
-  if run_eagerly:
-    if isinstance(
-        strategy,
-        (tf.distribute.TPUStrategy, tf.distribute.experimental.TPUStrategy)):
-      raise ValueError(
-          'TPUStrategy should not run eagerly as it heavily relies on graph'
-          ' optimization for the distributed system.')
-
-  if eval_input_fn and eval_steps is None:
-    raise ValueError(
-        '`eval_steps` is required when `eval_input_fn` is not None.')
-  if metric_fn and not callable(metric_fn):
-    raise ValueError(
-        'if `metric_fn` is specified, metric_fn must be a callable.')
-
-  total_training_steps = steps_per_epoch * epochs
-  train_iterator = _get_input_iterator(train_input_fn, strategy)
-  eval_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
-
-  with distribute_utils.get_strategy_scope(strategy):
-    # To correctly place the model weights on accelerators,
-    # model and optimizer should be created in scope.
-    model, sub_model = model_fn()
-    if not hasattr(model, 'optimizer'):
-      raise ValueError('User should set the optimizer attribute to model '
-                       'inside `model_fn`.')
-    if sub_model_export_name and sub_model is None:
-      raise ValueError('sub_model_export_name is specified as %s, but '
-                       'sub_model is None.' % sub_model_export_name)
-
-    callback_list = tf.keras.callbacks.CallbackList(
-        callbacks=custom_callbacks, model=model)
-
-    optimizer = model.optimizer
-
-    if init_checkpoint:
-      logging.info(
-          'Checkpoint file %s found and restoring from '
-          'initial checkpoint for core model.', init_checkpoint)
-      checkpoint = tf.train.Checkpoint(model=sub_model, encoder=sub_model)
-      checkpoint.restore(init_checkpoint).expect_partial().assert_existing_objects_matched()
-      logging.info('Loading from checkpoint file completed')
-
-    train_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
-    eval_metrics = metric_fn() if metric_fn else []
-    if not isinstance(eval_metrics, list):
-      eval_metrics = [eval_metrics]
-    # If evaluation is required, make a copy of each metric, as it will be
-    # used by both training and evaluation.
-    train_metrics = [
-        metric.__class__.from_config(metric.get_config())
-        for metric in eval_metrics
-    ]
-
-    # Create summary writers
-    if _should_export_summary(strategy):
-      summary_dir = os.path.join(model_dir, 'summaries')
-    else:
-      # In multi worker training we need every worker to write summary, because
-      # variables can trigger synchronization on read and synchronization needs
-      # all workers to participate.
-      summary_dir = tempfile.mkdtemp()
-    eval_summary_writer = tf.summary.create_file_writer(
-        os.path.join(summary_dir, 'eval'))
-    last_summary_step = 0
-    if steps_per_loop >= _MIN_SUMMARY_STEPS and train_summary_interval >= 0:
-      # Only write summaries when stats have been collected over sufficiently
-      # many steps.
-      train_summary_writer = tf.summary.create_file_writer(
-          os.path.join(summary_dir, 'train'))
-    else:
-      train_summary_writer = tf.summary.create_noop_writer()
-
-    # Collects training variables.
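Before the deleted _replicated_step below, a stripped-down, single-replica sketch of the same tape-based update it performs (the mixed-precision and explicit-allreduce branches are omitted); model, optimizer, and loss_fn stand for the objects created above:

    import tensorflow as tf

    def train_step_sketch(model, optimizer, loss_fn, inputs, labels,
                          num_replicas=1):
      with tf.GradientTape() as tape:
        outputs = model(inputs, training=True)
        # Scale so summed per-replica gradients match the global loss.
        loss = loss_fn(labels, outputs) / num_replicas
      grads = tape.gradient(loss, model.trainable_variables)
      optimizer.apply_gradients(zip(grads, model.trainable_variables))
      return loss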
-    training_vars = model.trainable_variables
-
-    def _replicated_step(inputs):
-      """Replicated training step."""
-
-      inputs, labels = inputs
-      with tf.GradientTape() as tape:
-        model_outputs = model(inputs, training=True)
-        loss = loss_fn(labels, model_outputs)
-        # Raw loss is used for reporting in metrics/logs.
-        raw_loss = loss
-        if scale_loss:
-          # Scales down the loss so gradients are invariant to the number of
-          # replicas.
-          loss = loss / strategy.num_replicas_in_sync
-
-      if explicit_allreduce:
-        grad_utils.minimize_using_explicit_allreduce(tape, optimizer, loss,
-                                                     training_vars,
-                                                     pre_allreduce_callbacks,
-                                                     post_allreduce_callbacks,
-                                                     allreduce_bytes_per_pack)
-      else:
-        if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer):
-          with tape:
-            scaled_loss = optimizer.get_scaled_loss(loss)
-          scaled_grads = tape.gradient(scaled_loss, training_vars)
-          grads = optimizer.get_unscaled_gradients(scaled_grads)
-        else:
-          grads = tape.gradient(loss, training_vars)
-        optimizer.apply_gradients(zip(grads, training_vars))
-      # For reporting, the metric takes the mean of losses.
-      train_loss_metric.update_state(raw_loss)
-      for metric in train_metrics:
-        metric.update_state(labels, model_outputs)
-
-    @tf.function
-    def train_steps(iterator, steps):
-      """Performs distributed training steps in a loop.
-
-      Args:
-        iterator: the distributed iterator of training datasets.
-        steps: a tf.int32 tensor specifying the number of steps to run
-          inside the host training loop.
-
-      Raises:
-        ValueError: Any of the arguments or tensor shapes are invalid.
-      """
-      if not isinstance(steps, tf.Tensor):
-        raise ValueError('steps should be a Tensor. A Python object may cause '
-                         'retracing.')
-
-      for _ in tf.range(steps):
-        strategy.run(_replicated_step, args=(next(iterator),))
-
-    def train_single_step(iterator):
-      """Performs a distributed training step.
-
-      Args:
-        iterator: the distributed iterator of training datasets.
-
-      Raises:
-        ValueError: Any of the arguments or tensor shapes are invalid.
-      """
-      strategy.run(_replicated_step, args=(next(iterator),))
-
-    def test_step(iterator):
-      """Calculates evaluation metrics on distributed devices."""
-
-      def _test_step_fn(inputs):
-        """Replicated accuracy calculation."""
-
-        inputs, labels = inputs
-        model_outputs = model(inputs, training=False)
-        for metric in eval_metrics:
-          metric.update_state(labels, model_outputs)
-        return model_outputs, labels
-
-      outputs, labels = strategy.run(_test_step_fn, args=(next(iterator),))
-      outputs = tf.nest.map_structure(strategy.experimental_local_results,
-                                      outputs)
-      labels = tf.nest.map_structure(strategy.experimental_local_results,
-                                     labels)
-      return outputs, labels
-
-    if not run_eagerly:
-      train_single_step = tf.function(train_single_step)
-      test_step = tf.function(test_step)
-
-    def _run_evaluation(current_training_step, test_iterator):
-      """Runs validation steps and aggregates metrics.
-
-      Args:
-        current_training_step: tf.int32 tensor containing the current step.
-        test_iterator: distributed iterator of test datasets.
-
-      Returns:
-        A dict of metric names and values.
-      """
-      # The last batch of the evaluation is often smaller than previous ones.
-      # Moreover, on some replicas it might even be empty. Therefore, different
-      # from the way training_loss is calculated, we need to gather all the
-      # logits and labels here and calculate the evaluation loss
-      # outside.
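Numerically, the aggregation described in the comment above is just a weighted mean of per-batch losses, weighted by the number of real examples each batch contained; a tiny sketch with assumed values:

    import tensorflow as tf

    eval_loss = tf.keras.metrics.Mean('eval_loss')
    # Assumed: two batches with 32 and 8 real examples respectively.
    eval_loss.update_state([0.5, 0.9], sample_weight=[32, 8])
    assert abs(float(eval_loss.result()) - 0.58) < 1e-6  # (0.5*32 + 0.9*8) / 40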
-      loss_list, loss_weights = list(), list()
-      for _ in range(eval_steps):
-        outputs, labels = test_step(test_iterator)
-        for cur_logits, cur_labels in zip(outputs, labels):
-          # This is to handle cases when cur_labels is not a single tensor,
-          # but a dict of tensors.
-          cur_weight = tf.shape(tf.nest.flatten(cur_labels)[0])[0]
-          if cur_weight != 0:
-            loss_list.append(loss_fn(cur_labels, cur_logits).numpy())
-            loss_weights.append(cur_weight)
-      # The sample_weights are the actual number of examples in each batch,
-      # a sum of the number of examples in each replica when using
-      # distributed training.
-      eval_loss_metric.update_state(loss_list, sample_weight=loss_weights)
-
-      logs = {}
-      with eval_summary_writer.as_default():
-        for metric in [eval_loss_metric] + eval_metrics + model.metrics:
-          metric_value = _float_metric_value(metric)
-          logs[metric.name] = metric_value
-          logging.info('Step: [%d] Validation %s = %f', current_training_step,
-                       metric.name, metric_value)
-          tf.summary.scalar(
-              metric.name, metric_value, step=current_training_step)
-        eval_summary_writer.flush()
-
-      return logs
-
-    # Training loop starts here.
-    checkpoint = tf.train.Checkpoint(
-        model=model, optimizer=optimizer, global_step=optimizer.iterations)
-    sub_model_checkpoint = tf.train.Checkpoint(
-        model=sub_model,
-        global_step=optimizer.iterations) if sub_model_export_name else None
-
-    latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
-    if latest_checkpoint_file:
-      logging.info('Checkpoint file %s found and restoring from '
-                   'checkpoint', latest_checkpoint_file)
-      checkpoint.restore(latest_checkpoint_file).expect_partial()
-      logging.info('Loading from checkpoint file completed')
-
-    current_step = optimizer.iterations.numpy()
-    checkpoint_name = 'ctl_step_{step}.ckpt'
-
-    logs = {}
-    callback_list.on_train_begin()
-    while current_step < total_training_steps and not model.stop_training:
-      if current_step % steps_per_epoch == 0:
-        callback_list.on_epoch_begin(int(current_step / steps_per_epoch) + 1)
-
-      # Training loss/metrics are averaged over the steps inside the micro
-      # training loop. We reset their values before each round.
-      train_loss_metric.reset_states()
-      for metric in train_metrics + model.metrics:
-        metric.reset_states()
-
-      callback_list.on_batch_begin(current_step)
-      # Runs several steps in the host while loop.
-      steps = steps_to_run(current_step, steps_between_evals, steps_per_loop)
-
-      if tf.config.list_physical_devices('GPU'):
-        # TODO(zongweiz): merge with train_steps once tf.while_loop
-        # GPU performance bugs are fixed.
-        for _ in range(steps):
-          train_single_step(train_iterator)
-      else:
-        # Converts steps to a Tensor to avoid tf.function retracing.
-        train_steps(train_iterator, tf.convert_to_tensor(steps, dtype=tf.int32))
-
-      train_loss = _float_metric_value(train_loss_metric)
-      current_step += steps
-
-      # Updates training logging.
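For orientation, the steps value used in the loop above comes from steps_to_run (defined near the top of this file), which never runs past an epoch or evaluation boundary; a re-derivation of its arithmetic under assumed settings:

    def steps_to_run_sketch(current_step, steps_per_epoch, steps_per_loop):
      remainder = current_step % steps_per_epoch
      if remainder:
        return min(steps_per_epoch - remainder, steps_per_loop)
      return steps_per_loop

    assert steps_to_run_sketch(0, 20, 10) == 10   # the full loop fits
    assert steps_to_run_sketch(15, 20, 10) == 5   # stop at the epoch boundary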
-      training_status = 'Train Step: %d/%d  / loss = %s' % (
-          current_step, total_training_steps, train_loss)
-
-      if current_step >= last_summary_step + train_summary_interval:
-        summary_writer = train_summary_writer
-        last_summary_step = current_step
-      else:
-        summary_writer = tf.summary.create_noop_writer()
-
-      with summary_writer.as_default():
-        if callable(optimizer.learning_rate):
-          tf.summary.scalar(
-              'learning_rate',
-              optimizer.learning_rate(current_step),
-              step=current_step)
-        tf.summary.scalar(train_loss_metric.name, train_loss, step=current_step)
-        for metric in train_metrics + model.metrics:
-          metric_value = _float_metric_value(metric)
-          training_status += ' %s = %f' % (metric.name, metric_value)
-          tf.summary.scalar(metric.name, metric_value, step=current_step)
-        summary_writer.flush()
-      training_status += '\n'
-      logging.info(training_status)
-
-      # If there is no need for evaluation, we only call on_batch_end with
-      # train_loss; this ensures we get granular global_step/sec on
-      # TensorBoard.
-      if current_step % steps_between_evals:
-        callback_list.on_batch_end(current_step - 1, {'loss': train_loss})
-      else:
-        # Save a submodel with the step in the file name after each epoch.
-        if sub_model_export_name:
-          _save_checkpoint(
-              strategy, sub_model_checkpoint, model_dir,
-              '%s_step_%d.ckpt' % (sub_model_export_name, current_step))
-
-        # Save model checkpoints and run validation steps after each epoch
-        # (with the exception of the final epoch, which is handled after the
-        # training loop).
-        if current_step < total_training_steps:
-          _save_checkpoint(strategy, checkpoint, model_dir,
-                           checkpoint_name.format(step=current_step))
-          if eval_input_fn:
-            # Re-initialize evaluation metric.
-            eval_loss_metric.reset_states()
-            for metric in eval_metrics + model.metrics:
-              metric.reset_states()
-
-            logging.info('Running evaluation after step: %s.', current_step)
-            logs = _run_evaluation(current_step,
-                                   _get_input_iterator(eval_input_fn, strategy))
-        # We add train_loss here rather than call on_batch_end twice to make
-        # sure that no duplicated values are generated.
-        logs['loss'] = train_loss
-        callback_list.on_batch_end(current_step - 1, logs)
-
-      # Calls on_epoch_end after each real epoch ends to prevent miscalculation
-      # of training steps.
-      if current_step % steps_per_epoch == 0:
-        callback_list.on_epoch_end(int(current_step / steps_per_epoch), logs)
-
-    if sub_model_export_name:
-      _save_checkpoint(strategy, sub_model_checkpoint, model_dir,
-                       '%s.ckpt' % sub_model_export_name)
-
-    _save_checkpoint(strategy, checkpoint, model_dir,
-                     checkpoint_name.format(step=current_step))
-    if eval_input_fn:
-      # Re-initialize evaluation metric.
- eval_loss_metric.reset_states() - for metric in eval_metrics + model.metrics: - metric.reset_states() - - logging.info('Running final evaluation after training is complete.') - logs = _run_evaluation(current_step, - _get_input_iterator(eval_input_fn, strategy)) - callback_list.on_epoch_end(int(current_step / steps_per_epoch), logs) - training_summary = { - 'total_training_steps': total_training_steps, - 'train_loss': _float_metric_value(train_loss_metric), - } - for metric in model.metrics: - training_summary[metric.name] = _float_metric_value(metric) - if eval_metrics: - training_summary['last_train_metrics'] = _float_metric_value( - train_metrics[0]) - training_summary['eval_metrics'] = _float_metric_value(eval_metrics[0]) - - write_txt_summary(training_summary, summary_dir) - - if not _should_export_summary(strategy): - tf.io.gfile.rmtree(summary_dir) - - callback_list.on_train_end() - - return model diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/model_training_utils_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/model_training_utils_test.py deleted file mode 100644 index 52011f0c2e21f2c3a182df4a90dc5f71f55779ef..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/model_training_utils_test.py +++ /dev/null @@ -1,322 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""Tests for official.modeling.training.model_training_utils.""" - -import os - -from absl import logging -from absl.testing import flagsaver -from absl.testing import parameterized -from absl.testing.absltest import mock -import numpy as np -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations -from official.nlp.bert import common_flags -from official.nlp.bert import model_training_utils - - -common_flags.define_common_bert_flags() - - -def eager_strategy_combinations(): - return combinations.combine( - distribution=[ - strategy_combinations.default_strategy, - strategy_combinations.cloud_tpu_strategy, - strategy_combinations.one_device_strategy_gpu, - strategy_combinations.mirrored_strategy_with_gpu_and_cpu, - strategy_combinations.mirrored_strategy_with_two_gpus, - ],) - - -def eager_gpu_strategy_combinations(): - return combinations.combine( - distribution=[ - strategy_combinations.default_strategy, - strategy_combinations.one_device_strategy_gpu, - strategy_combinations.mirrored_strategy_with_gpu_and_cpu, - strategy_combinations.mirrored_strategy_with_two_gpus, - ],) - - -def create_fake_data_input_fn(batch_size, features_shape, num_classes): - """Creates a dummy input function with the given feature and label shapes. - - Args: - batch_size: integer. - features_shape: list[int]. Feature shape for an individual example. - num_classes: integer. Number of labels. - - Returns: - An input function that is usable in the executor. - """ - - def _dataset_fn(input_context=None): - """An input function for generating fake data.""" - local_batch_size = input_context.get_per_replica_batch_size(batch_size) - features = np.random.rand(64, *features_shape) - labels = np.random.randint(2, size=[64, num_classes]) - # Convert the inputs to a Dataset. - dataset = tf.data.Dataset.from_tensor_slices((features, labels)) - dataset = dataset.shard(input_context.num_input_pipelines, - input_context.input_pipeline_id) - - def _assign_dtype(features, labels): - features = tf.cast(features, tf.float32) - labels = tf.cast(labels, tf.float32) - return features, labels - - # Shuffle, repeat, and batch the examples. 
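A note on local_batch_size above: input_context.get_per_replica_batch_size divides the global batch size evenly across replicas (and raises a ValueError when it does not divide evenly). A sketch of the arithmetic with assumed example values:

    global_batch_size, num_replicas_in_sync = 32, 4  # assumed example values
    per_replica_batch_size = global_batch_size // num_replicas_in_sync
    assert per_replica_batch_size == 8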
- dataset = dataset.map(_assign_dtype) - dataset = dataset.shuffle(64).repeat() - dataset = dataset.batch(local_batch_size, drop_remainder=True) - dataset = dataset.prefetch(buffer_size=64) - return dataset - - return _dataset_fn - - -def create_model_fn(input_shape, num_classes, use_float16=False): - - def _model_fn(): - """A one-layer softmax model suitable for testing.""" - input_layer = tf.keras.layers.Input(shape=input_shape) - x = tf.keras.layers.Dense(num_classes, activation='relu')(input_layer) - output_layer = tf.keras.layers.Dense(num_classes, activation='softmax')(x) - sub_model = tf.keras.models.Model(input_layer, x, name='sub_model') - model = tf.keras.models.Model(input_layer, output_layer, name='model') - model.add_metric( - tf.reduce_mean(input_layer), name='mean_input', aggregation='mean') - model.optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9) - if use_float16: - model.optimizer = tf.keras.mixed_precision.LossScaleOptimizer( - model.optimizer) - return model, sub_model - - return _model_fn - - -def metric_fn(): - """Gets a tf.keras metric object.""" - return tf.keras.metrics.CategoricalAccuracy(name='accuracy', dtype=tf.float32) - - -def summaries_with_matching_keyword(keyword, summary_dir): - """Yields summary protos matching given keyword from event file.""" - event_paths = tf.io.gfile.glob(os.path.join(summary_dir, 'events*')) - for event in tf.compat.v1.train.summary_iterator(event_paths[-1]): - if event.summary is not None: - for value in event.summary.value: - if keyword in value.tag: - logging.error(event) - yield event.summary - - -def check_eventfile_for_keyword(keyword, summary_dir): - """Checks event files for the keyword.""" - return any(summaries_with_matching_keyword(keyword, summary_dir)) - - -class RecordingCallback(tf.keras.callbacks.Callback): - - def __init__(self): - self.batch_begin = [] # (batch, logs) - self.batch_end = [] # (batch, logs) - self.epoch_begin = [] # (epoch, logs) - self.epoch_end = [] # (epoch, logs) - - def on_batch_begin(self, batch, logs=None): - self.batch_begin.append((batch, logs)) - - def on_batch_end(self, batch, logs=None): - self.batch_end.append((batch, logs)) - - def on_epoch_begin(self, epoch, logs=None): - self.epoch_begin.append((epoch, logs)) - - def on_epoch_end(self, epoch, logs=None): - self.epoch_end.append((epoch, logs)) - - -class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase): - - def setUp(self): - super(ModelTrainingUtilsTest, self).setUp() - self._model_fn = create_model_fn(input_shape=[128], num_classes=3) - - @flagsaver.flagsaver - def run_training(self, strategy, model_dir, steps_per_loop, run_eagerly): - input_fn = create_fake_data_input_fn( - batch_size=8, features_shape=[128], num_classes=3) - model_training_utils.run_customized_training_loop( - strategy=strategy, - model_fn=self._model_fn, - loss_fn=tf.keras.losses.categorical_crossentropy, - model_dir=model_dir, - steps_per_epoch=20, - steps_per_loop=steps_per_loop, - epochs=2, - train_input_fn=input_fn, - eval_input_fn=input_fn, - eval_steps=10, - init_checkpoint=None, - sub_model_export_name='my_submodel_name', - metric_fn=metric_fn, - custom_callbacks=None, - run_eagerly=run_eagerly) - - @combinations.generate(eager_strategy_combinations()) - def test_train_eager_single_step(self, distribution): - model_dir = self.create_tempdir().full_path - if isinstance( - distribution, - (tf.distribute.TPUStrategy, tf.distribute.experimental.TPUStrategy)): - with self.assertRaises(ValueError): - self.run_training( - 
distribution, model_dir, steps_per_loop=1, run_eagerly=True) - else: - self.run_training( - distribution, model_dir, steps_per_loop=1, run_eagerly=True) - - @combinations.generate(eager_gpu_strategy_combinations()) - def test_train_eager_mixed_precision(self, distribution): - model_dir = self.create_tempdir().full_path - tf.keras.mixed_precision.set_global_policy('mixed_float16') - self._model_fn = create_model_fn( - input_shape=[128], num_classes=3, use_float16=True) - self.run_training( - distribution, model_dir, steps_per_loop=1, run_eagerly=True) - - @combinations.generate(eager_strategy_combinations()) - def test_train_check_artifacts(self, distribution): - model_dir = self.create_tempdir().full_path - self.run_training( - distribution, model_dir, steps_per_loop=10, run_eagerly=False) - - # Two checkpoints should be saved after two epochs. - files = map(os.path.basename, - tf.io.gfile.glob(os.path.join(model_dir, 'ctl_step_*index'))) - self.assertCountEqual( - ['ctl_step_20.ckpt-1.index', 'ctl_step_40.ckpt-2.index'], files) - - # Three submodel checkpoints should be saved after two epochs (one after - # each epoch plus one final). - files = map( - os.path.basename, - tf.io.gfile.glob(os.path.join(model_dir, 'my_submodel_name*index'))) - self.assertCountEqual([ - 'my_submodel_name.ckpt-3.index', - 'my_submodel_name_step_20.ckpt-1.index', - 'my_submodel_name_step_40.ckpt-2.index' - ], files) - - self.assertNotEmpty( - tf.io.gfile.glob( - os.path.join(model_dir, 'summaries/training_summary*'))) - - # Loss and accuracy values should be written into summaries. - self.assertTrue( - check_eventfile_for_keyword('loss', - os.path.join(model_dir, 'summaries/train'))) - self.assertTrue( - check_eventfile_for_keyword('accuracy', - os.path.join(model_dir, 'summaries/train'))) - self.assertTrue( - check_eventfile_for_keyword('mean_input', - os.path.join(model_dir, 'summaries/train'))) - self.assertTrue( - check_eventfile_for_keyword('accuracy', - os.path.join(model_dir, 'summaries/eval'))) - self.assertTrue( - check_eventfile_for_keyword('mean_input', - os.path.join(model_dir, 'summaries/eval'))) - - @combinations.generate(eager_strategy_combinations()) - def test_train_check_callbacks(self, distribution): - model_dir = self.create_tempdir().full_path - callback = RecordingCallback() - callbacks = [callback] - input_fn = create_fake_data_input_fn( - batch_size=8, features_shape=[128], num_classes=3) - model_training_utils.run_customized_training_loop( - strategy=distribution, - model_fn=self._model_fn, - loss_fn=tf.keras.losses.categorical_crossentropy, - model_dir=model_dir, - steps_per_epoch=20, - num_eval_per_epoch=4, - steps_per_loop=10, - epochs=2, - train_input_fn=input_fn, - eval_input_fn=input_fn, - eval_steps=10, - init_checkpoint=None, - metric_fn=metric_fn, - custom_callbacks=callbacks, - run_eagerly=False) - self.assertEqual(callback.epoch_begin, [(1, {}), (2, {})]) - epoch_ends, epoch_end_infos = zip(*callback.epoch_end) - self.assertEqual(list(epoch_ends), [1, 2, 2]) - for info in epoch_end_infos: - self.assertIn('accuracy', info) - - self.assertEqual(callback.batch_begin, [(0, {}), (5, {}), (10, {}), - (15, {}), (20, {}), (25, {}), - (30, {}), (35, {})]) - batch_ends, batch_end_infos = zip(*callback.batch_end) - self.assertEqual(list(batch_ends), [4, 9, 14, 19, 24, 29, 34, 39]) - for info in batch_end_infos: - self.assertIn('loss', info) - - @combinations.generate( - combinations.combine( - distribution=[ - strategy_combinations.one_device_strategy_gpu, - ],)) - def 
test_train_check_artifacts_non_chief(self, distribution): - # We shouldn't export artifacts on non-chief workers. Since there's no easy - # way to test with real MultiWorkerMirroredStrategy, we patch the strategy - # to make it as if it's MultiWorkerMirroredStrategy on non-chief workers. - extended = distribution.extended - with mock.patch.object(extended.__class__, 'should_checkpoint', - new_callable=mock.PropertyMock, return_value=False), \ - mock.patch.object(extended.__class__, 'should_save_summary', - new_callable=mock.PropertyMock, return_value=False): - model_dir = self.create_tempdir().full_path - self.run_training( - distribution, model_dir, steps_per_loop=10, run_eagerly=False) - self.assertEmpty(tf.io.gfile.listdir(model_dir)) - - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/npu_convert_dropout.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/npu_convert_dropout.py deleted file mode 100644 index 95f8689ce4da26c08f18a0fcb49c42eb7f1c8b06..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/npu_convert_dropout.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#
-
-from keras import backend
-from keras.utils import control_flow_util
-from keras.layers.core import Dropout
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import nn
-import npu_ops
-
-def dropout_call(self, inputs, training=None):
-  """Make Keras Dropout execute the NPU dropout op."""
-  if training is None:
-    training = backend.learning_phase()
-
-  def dropped_inputs():
-    return npu_ops.dropout(
-        inputs,
-        noise_shape=self._get_noise_shape(inputs),
-        seed=self.seed,
-        keep_prob=1 - self.rate)
-
-  output = control_flow_util.smart_cond(training,
-                                        dropped_inputs,
-                                        lambda: array_ops.identity(inputs))
-
-  return output
-
-Dropout.call = dropout_call
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/npu_ops.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/npu_ops.py
deleted file mode 100644
index 9de214f8b8eb307ef743bdb29dc0488a61ce32a1..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/npu_ops.py
+++ /dev/null
@@ -1,260 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-"""Ops for collective operations implemented using hccl."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-
-import numbers
-from tensorflow.python.ops import array_ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import ops
-from tensorflow.python.eager import context
-
-from npu_device import gen_npu_ops
-
-
-DEFAULT_GRAPH_SEED = 87654321
-_MAXINT32 = 2**31 - 1
-def LARSV2(input_weight,
-           input_grad,
-           weight_decay,
-           learning_rate,
-           hyperpara=0.001,
-           epsilon=0.00001,
-           use_clip=False,
-           name=None):
-  if context.executing_eagerly():
-    raise RuntimeError("tf.LARSV2() is not compatible with "
-                       "eager execution.")
-
-  return gen_npu_ops.lars_v2(input_weight=input_weight,
-                             input_grad=input_grad,
-                             weight_decay=weight_decay,
-                             learning_rate=learning_rate,
-                             hyperpara=hyperpara,
-                             epsilon=epsilon,
-                             use_clip=use_clip,
-                             name=name)
-
-
-def _truncate_seed(seed):
-  return seed % _MAXINT32  # Truncate to fit into 32-bit integer
-
-def get_seed(op_seed):
-  global_seed = ops.get_default_graph().seed
-
-  if global_seed is not None:
-    if op_seed is None:
-      op_seed = ops.get_default_graph()._last_id
-
-    seeds = _truncate_seed(global_seed), _truncate_seed(op_seed)
-  else:
-    if op_seed is not None:
-      seeds = DEFAULT_GRAPH_SEED, _truncate_seed(op_seed)
-    else:
-      seeds = None, None
-  # Avoid (0, 0) as the C++ ops interpret it as nondeterminism, which would
-  # be unexpected since Python docs say nondeterminism is (None, None).
-  if seeds == (0, 0):
-    return (0, _MAXINT32)
-  return seeds
-
-def _get_noise_shape(x, noise_shape):
-  # If noise_shape is None, return immediately.
-  if noise_shape is None:
-    return array_ops.shape(x)
-
-  try:
-    # Best effort to figure out the intended shape.
-    # If not possible, let the op handle it.
-    # In eager mode an exception will show up.
-    noise_shape_ = tensor_shape.as_shape(noise_shape)
-  except (TypeError, ValueError):
-    return noise_shape
-
-  if x.shape.dims is not None and len(x.shape.dims) == len(noise_shape_.dims):
-    new_dims = []
-    for i, dim in enumerate(x.shape.dims):
-      if noise_shape_.dims[i].value is None and dim.value is not None:
-        new_dims.append(dim.value)
-      else:
-        new_dims.append(noise_shape_.dims[i].value)
-    return tensor_shape.TensorShape(new_dims)
-
-  return noise_shape
-
-def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):
-  """NPU implementation of dropout.
-
-  Args:
-    x: A float tensor.
-    keep_prob: A float, the probability that each element is kept.
-    noise_shape: A 1-D int32 tensor, the shape of the randomly generated
-      keep/drop mask.
-    seed: Random seed.
-    name: Layer name.
-
-  Returns:
-    A tensor.
-  """
-  if context.executing_eagerly():
-    raise RuntimeError("tf.dropout() is not compatible with "
-                       "eager execution.")
-  x = ops.convert_to_tensor(x, name="x")
-  if not x.dtype.is_floating:
-    raise ValueError("x has to be a floating point tensor since it's going to"
-                     " be scaled. Got a %s tensor instead."
% x.dtype) - if isinstance(keep_prob, numbers.Real) and not 0 < keep_prob <= 1: - raise ValueError("keep_prob must be a scalar tensor or a float in the " - "range (0, 1], got %g" % keep_prob) - if isinstance(keep_prob, float) and keep_prob == 1: - return x - seed, seed2 = get_seed(seed) - noise_shape = _get_noise_shape(x, noise_shape) - gen_out = gen_npu_ops.drop_out_gen_mask(noise_shape, keep_prob, seed, seed2, name) - result = gen_npu_ops.drop_out_do_mask(x, gen_out, keep_prob, name) - return result - -#@ops.RegisterGradient("DropOutDoMask") -def _DropOutDoMaskGrad(op, grad): - result = gen_npu_ops.drop_out_do_mask(grad, op.inputs[1], op.inputs[2]) - return [result, None, None] - -grad_registry_list = ops.gradient_registry.list() -if "DropOutDoMask" not in grad_registry_list: - ops.RegisterGradient("DropOutDoMask")(_DropOutDoMaskGrad) - -def basic_lstm_cell(x, h, c, w, b, keep_prob, forget_bias, state_is_tuple, - activation, name=None): - if context.executing_eagerly(): - raise RuntimeError("tf.basic_lstm_cell() is not compatible with " - "eager execution.") - x = ops.convert_to_tensor(x, name="x") - h = ops.convert_to_tensor(h, name="h") - c = ops.convert_to_tensor(c, name="c") - w = ops.convert_to_tensor(w, name="w") - b = ops.convert_to_tensor(b, name="b") - result = gen_npu_ops.basic_lstm_cell(x, h, c, w, b, keep_prob, forget_bias, state_is_tuple, - activation, name) - return result - -@ops.RegisterGradient("BasicLSTMCell") -def basic_lstm_cell_grad(op, dct, dht, dit, djt, dft, dot, dtanhct): - - dgate, dct_1 = gen_npu_ops.basic_lstm_cell_c_state_grad(op.inputs[2], dht, dct, op.outputs[2], op.outputs[3], op.outputs[4], op.outputs[5], op.outputs[6], forget_bias=op.get_attr("forget_bias"), activation=op.get_attr("activation")) - dw, db = gen_npu_ops.basic_lstm_cell_weight_grad(op.inputs[0], op.inputs[1], dgate) - dxt, dht = gen_npu_ops.basic_lstm_cell_input_grad(dgate, op.inputs[3], keep_prob=op.get_attr("keep_prob")) - - return [dxt, dht, dct_1, dw, db] - -def adam_apply_one_assign(input0, input1, input2, input3, input4, - mul0_x, mul1_x, mul2_x, mul3_x, add2_y, name=None): - if context.executing_eagerly(): - raise RuntimeError("tf.adam_apply_one_assign() is not compatible with " - "eager execution.") - result = gen_npu_ops.adam_apply_one_assign(input0, input1, input2, input3, input4, - mul0_x, mul1_x, mul2_x, mul3_x, add2_y,name) - return result - -def adam_apply_one_with_decay_assign(input0, input1, input2, input3, input4, - mul0_x, mul1_x, mul2_x, mul3_x, mul4_x, add2_y, name=None): - if context.executing_eagerly(): - raise RuntimeError("tf.adam_apply_one_with_decay_assign() is not compatible with " - "eager execution.") - result = gen_npu_ops.adam_apply_one_with_decay_assign(input0, input1, input2, input3, input4, - mul0_x, mul1_x, mul2_x, mul3_x, mul4_x, add2_y, name) - return result - -@ops.RegisterGradient("DynamicGruV2") -def dynamic_gru_v2_grad(op, dy, doutput_h, dupdate, dreset, dnew, dhidden_new): - (x, weight_input, weight_hidden, bias_input, bias_hidden, seq_length, init_h) = op.inputs - (y, output_h, update, reset, new, hidden_new) = op.outputs - (dw_input, dw_hidden, db_input, db_hidden, dx, dh_prev) = gen_npu_ops.dynamic_gru_v2_grad(x, weight_input, weight_hidden, y, init_h, output_h, dy, doutput_h, update, reset, new, hidden_new, direction=op.get_attr("direction"), cell_depth=op.get_attr("cell_depth"), keep_prob=op.get_attr("keep_prob"), cell_clip=op.get_attr("cell_clip"), num_proj=op.get_attr("num_proj"), time_major=op.get_attr("time_major"), 
gate_order=op.get_attr("gate_order"), reset_after=op.get_attr("reset_after"))
-
-  return (dx, dw_input, dw_hidden, db_input, db_hidden, seq_length, dh_prev)
-
-@ops.RegisterGradient("DynamicRnn")
-def dynamic_rnn_grad(op, dy, dh, dc, di, dj, df, do, dtanhc):
-  (x, w, b, seq_length, init_h, init_c) = op.inputs
-  (y, output_h, output_c, i, j, f, o, tanhc) = op.outputs
-  (dw, db, dx, dh_prev, dc_prev) = gen_npu_ops.dynamic_rnn_grad(x, w, b, y, init_h[-1], init_c[-1], output_h, output_c, dy, dh[-1], dc[-1], i, j, f, o, tanhc, cell_type=op.get_attr("cell_type"), direction=op.get_attr("direction"), cell_depth=op.get_attr("cell_depth"), use_peephole=op.get_attr("use_peephole"), keep_prob=op.get_attr("keep_prob"), cell_clip=op.get_attr("cell_clip"), num_proj=op.get_attr("num_proj"), time_major=op.get_attr("time_major"), forget_bias=op.get_attr("forget_bias"))
-
-  return (dx, dw, db, seq_length, dh_prev, dc_prev)
-
-def lamb_apply_optimizer_assign(input0,input1,input2,input3,mul0_x,mul1_x,mul2_x,
-                                mul3_x,add2_y,steps,do_use_weight,weight_decay_rate,name=None):
-  if context.executing_eagerly():
-    raise RuntimeError("tf.lamb_apply_optimizer_assign() is not compatible with eager execution")
-  update,nextv,nextm=gen_npu_ops.lamb_apply_optimizer_assign(input0,input1,input2,input3,mul0_x,mul1_x,mul2_x,
-                                                             mul3_x,add2_y,steps,do_use_weight,weight_decay_rate,name)
-  return update,nextv,nextm
-
-def lamb_apply_weight_assign(input0,input1,input2,input3,input4,name=None):
-  if context.executing_eagerly():
-    raise RuntimeError("tf.lamb_apply_weight_assign() is not compatible with eager execution")
-  result = gen_npu_ops.lamb_apply_weight_assign(input0,input1,input2,input3,input4,name)
-  return result
-
-def dropout_v3(x, keep_prob, noise_shape=None, seed=None, name=None):
-  """NPU implementation of dropout (v3).
-
-  Args:
-    x: A float tensor.
-    keep_prob: A float, the probability that each element is kept.
-    noise_shape: A 1-D int32 tensor, the shape of the randomly generated
-      keep/drop mask.
-    seed: Random seed.
-    name: Layer name.
-
-  Returns:
-    A tensor.
-  """
-  x = ops.convert_to_tensor(x, name="x")
-  if not x.dtype.is_floating:
-    raise ValueError("x has to be a floating point tensor since it's going to be scaled. Got a %s tensor instead." % x.dtype)
-
-  if isinstance(keep_prob, numbers.Real) and not 0 < keep_prob <= 1:
-    raise ValueError("keep_prob must be a scalar tensor or a float in the range (0,1], got %g" % keep_prob)
-
-  if isinstance(keep_prob, float) and keep_prob == 1:
-    return x
-
-  seed, seed2 = get_seed(seed)
-  noise_shape = _get_noise_shape(x, noise_shape)
-  gen_out = gen_npu_ops.drop_out_gen_mask_v3(noise_shape, keep_prob, seed, seed2, name)
-  result = gen_npu_ops.drop_out_do_mask_v3(x, gen_out, keep_prob, name)
-  return result
-
-@ops.RegisterGradient("DropOutDoMaskV3")
-def _DropOutDoMaskV3Grad(op, grad):
-  result = gen_npu_ops.drop_out_do_mask_v3(grad, op.inputs[1], op.inputs[2])
-  return [result, None, None]
\ No newline at end of file
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/run_classifier.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/run_classifier.py
deleted file mode 100644
index 1796ddca9cd8778ee5d1001d49922022849163a2..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/run_classifier.py
+++ /dev/null
@@ -1,532 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""BERT classification or regression finetuning runner in TF 2.x.""" - -import functools -import json -import math -import os - -# Import libraries -from absl import app -from absl import flags -from absl import logging -import gin -import tensorflow as tf -from official.common import distribute_utils -from official.modeling import performance -from official.nlp import optimization -from official.nlp.bert import bert_models -from official.nlp.bert import common_flags -from official.nlp.bert import configs as bert_configs -from official.nlp.bert import input_pipeline -from official.nlp.bert import model_saving_utils -from official.utils.misc import keras_utils - -flags.DEFINE_enum( - 'mode', 'train_and_eval', ['train_and_eval', 'export_only', 'predict'], - 'One of {"train_and_eval", "export_only", "predict"}. `train_and_eval`: ' - 'trains the model and evaluates in the meantime. ' - '`export_only`: will take the latest checkpoint inside ' - 'model_dir and export a `SavedModel`. `predict`: takes a checkpoint and ' - 'restores the model to output predictions on the test set.') -flags.DEFINE_string('train_data_path', None, - 'Path to training data for BERT classifier.') -flags.DEFINE_string('eval_data_path', None, - 'Path to evaluation data for BERT classifier.') -flags.DEFINE_string( - 'input_meta_data_path', None, - 'Path to file that contains meta data about input ' - 'to be used for training and evaluation.') -flags.DEFINE_integer('train_data_size', None, 'Number of training samples ' - 'to use. If None, uses the full train data. ' - '(default: None).') -flags.DEFINE_string('predict_checkpoint_path', None, - 'Path to the checkpoint for predictions.') -flags.DEFINE_integer( - 'num_eval_per_epoch', 1, - 'Number of evaluations per epoch. The purpose of this flag is to provide ' - 'more granular evaluation scores and checkpoints. 
For example, if original ' - 'data has N samples and num_eval_per_epoch is n, then each epoch will be ' - 'evaluated every N/n samples.') -flags.DEFINE_integer('train_batch_size', 32, 'Batch size for training.') -flags.DEFINE_integer('eval_batch_size', 32, 'Batch size for evaluation.') - -common_flags.define_common_bert_flags() - -FLAGS = flags.FLAGS - -LABEL_TYPES_MAP = {'int': tf.int64, 'float': tf.float32} - - -def get_loss_fn(num_classes): - """Gets the classification loss function.""" - - def classification_loss_fn(labels, logits): - """Classification loss.""" - labels = tf.squeeze(labels) - log_probs = tf.nn.log_softmax(logits, axis=-1) - one_hot_labels = tf.one_hot( - tf.cast(labels, dtype=tf.int32), depth=num_classes, dtype=tf.float32) - per_example_loss = -tf.reduce_sum( - tf.cast(one_hot_labels, dtype=tf.float32) * log_probs, axis=-1) - return tf.reduce_mean(per_example_loss) - - return classification_loss_fn - - -def get_dataset_fn(input_file_pattern, - max_seq_length, - global_batch_size, - is_training, - label_type=tf.int64, - include_sample_weights=False, - num_samples=None): - """Gets a closure to create a dataset.""" - - def _dataset_fn(ctx=None): - """Returns tf.data.Dataset for distributed BERT pretraining.""" - batch_size = ctx.get_per_replica_batch_size( - global_batch_size) if ctx else global_batch_size - dataset = input_pipeline.create_classifier_dataset( - tf.io.gfile.glob(input_file_pattern), - max_seq_length, - batch_size, - is_training=is_training, - input_pipeline_context=ctx, - label_type=label_type, - include_sample_weights=include_sample_weights, - num_samples=num_samples) - return dataset - - return _dataset_fn - - -def run_bert_classifier(strategy, - bert_config, - input_meta_data, - model_dir, - epochs, - steps_per_epoch, - steps_per_loop, - eval_steps, - warmup_steps, - initial_lr, - init_checkpoint, - train_input_fn, - eval_input_fn, - training_callbacks=True, - custom_callbacks=None, - custom_metrics=None): - """Run BERT classifier training using low-level API.""" - max_seq_length = input_meta_data['max_seq_length'] - num_classes = input_meta_data.get('num_labels', 1) - is_regression = num_classes == 1 - - def _get_classifier_model(): - """Gets a classifier model.""" - classifier_model, core_model = ( - bert_models.classifier_model( - bert_config, - num_classes, - max_seq_length, - hub_module_url=FLAGS.hub_module_url, - hub_module_trainable=FLAGS.hub_module_trainable)) - optimizer = optimization.create_optimizer(initial_lr, - steps_per_epoch * epochs, - warmup_steps, FLAGS.end_lr, - FLAGS.optimizer_type) - classifier_model.optimizer = performance.configure_optimizer( - optimizer, - use_float16=common_flags.use_float16(), - use_graph_rewrite=common_flags.use_graph_rewrite()) - return classifier_model, core_model - - # tf.keras.losses objects accept optional sample_weight arguments (eg. coming - # from the dataset) to compute weighted loss, as used for the regression - # tasks. The classification tasks, using the custom get_loss_fn don't accept - # sample weights though. - loss_fn = (tf.keras.losses.MeanSquaredError() if is_regression - else get_loss_fn(num_classes)) - - # Defines evaluation metrics function, which will create metrics in the - # correct device and strategy scope. 
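As a sanity check on `get_loss_fn` above: the hand-rolled log-softmax/one-hot reduction is exactly sparse categorical cross-entropy. A small self-contained comparison with toy logits (not tied to the training pipeline):

```python
import tensorflow as tf

logits = tf.constant([[2.0, 0.5, -1.0], [0.1, 0.2, 0.3]])
labels = tf.constant([0, 2])

log_probs = tf.nn.log_softmax(logits, axis=-1)
one_hot = tf.one_hot(labels, depth=3)
manual = tf.reduce_mean(-tf.reduce_sum(one_hot * log_probs, axis=-1))

builtin = tf.reduce_mean(tf.keras.losses.sparse_categorical_crossentropy(
    labels, logits, from_logits=True))
# manual and builtin agree to within float tolerance.
```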
- if custom_metrics: - metric_fn = custom_metrics - elif is_regression: - metric_fn = functools.partial( - tf.keras.metrics.MeanSquaredError, - 'mean_squared_error', - dtype=tf.float32) - else: - metric_fn = functools.partial( - tf.keras.metrics.SparseCategoricalAccuracy, - 'accuracy', - dtype=tf.float32) - - # Start training using Keras compile/fit API. - logging.info('Training using TF 2.x Keras compile/fit API with ' - 'distribution strategy.') - return run_keras_compile_fit( - model_dir, - strategy, - _get_classifier_model, - train_input_fn, - eval_input_fn, - loss_fn, - metric_fn, - init_checkpoint, - epochs, - steps_per_epoch, - steps_per_loop, - eval_steps, - training_callbacks=training_callbacks, - custom_callbacks=custom_callbacks) - - -def run_keras_compile_fit(model_dir, - strategy, - model_fn, - train_input_fn, - eval_input_fn, - loss_fn, - metric_fn, - init_checkpoint, - epochs, - steps_per_epoch, - steps_per_loop, - eval_steps, - training_callbacks=True, - custom_callbacks=None): - """Runs BERT classifier model using Keras compile/fit API.""" - - with strategy.scope(): - training_dataset = train_input_fn() - evaluation_dataset = eval_input_fn() if eval_input_fn else None - bert_model, sub_model = model_fn() - optimizer = bert_model.optimizer - - if init_checkpoint: - checkpoint = tf.train.Checkpoint(model=sub_model, encoder=sub_model) - checkpoint.restore(init_checkpoint).expect_partial().assert_existing_objects_matched() - - if not isinstance(metric_fn, (list, tuple)): - metric_fn = [metric_fn] - bert_model.compile( - optimizer=optimizer, - loss=loss_fn, - metrics=[fn() for fn in metric_fn], - steps_per_execution=steps_per_loop) - - summary_dir = os.path.join(model_dir, 'summaries') - summary_callback = tf.keras.callbacks.TensorBoard(summary_dir) - checkpoint = tf.train.Checkpoint(model=bert_model, optimizer=optimizer) - checkpoint_manager = tf.train.CheckpointManager( - checkpoint, - directory=model_dir, - max_to_keep=None, - step_counter=optimizer.iterations, - checkpoint_interval=0) - checkpoint_callback = keras_utils.SimpleCheckpoint(checkpoint_manager) - - if training_callbacks: - if custom_callbacks is not None: - custom_callbacks += [summary_callback, checkpoint_callback] - else: - custom_callbacks = [summary_callback, checkpoint_callback] - - history = bert_model.fit( - x=training_dataset, - validation_data=evaluation_dataset, - steps_per_epoch=steps_per_epoch, - epochs=epochs, - validation_steps=eval_steps, - callbacks=custom_callbacks) - stats = {'total_training_steps': steps_per_epoch * epochs} - if 'loss' in history.history: - stats['train_loss'] = history.history['loss'][-1] - if 'val_accuracy' in history.history: - stats['eval_metrics'] = history.history['val_accuracy'][-1] - return bert_model, stats - - -def get_predictions_and_labels(strategy, - trained_model, - eval_input_fn, - is_regression=False, - return_probs=False): - """Obtains predictions of trained model on evaluation data. - - Note that list of labels is returned along with the predictions because the - order changes on distributing dataset over TPU pods. - - Args: - strategy: Distribution strategy. - trained_model: Trained model with preloaded weights. - eval_input_fn: Input function for evaluation data. - is_regression: Whether it is a regression task. - return_probs: Whether to return probabilities of classes. - - Returns: - predictions: List of predictions. - labels: List of gold labels corresponding to predictions. 
- """ - - @tf.function - def test_step(iterator): - """Computes predictions on distributed devices.""" - - def _test_step_fn(inputs): - """Replicated predictions.""" - inputs, labels = inputs - logits = trained_model(inputs, training=False) - if not is_regression: - probabilities = tf.nn.softmax(logits) - return probabilities, labels - else: - return logits, labels - - outputs, labels = strategy.run(_test_step_fn, args=(next(iterator),)) - # outputs: current batch logits as a tuple of shard logits - outputs = tf.nest.map_structure(strategy.experimental_local_results, - outputs) - labels = tf.nest.map_structure(strategy.experimental_local_results, labels) - return outputs, labels - - def _run_evaluation(test_iterator): - """Runs evaluation steps.""" - preds, golds = list(), list() - try: - with tf.experimental.async_scope(): - while True: - probabilities, labels = test_step(test_iterator) - for cur_probs, cur_labels in zip(probabilities, labels): - if return_probs: - preds.extend(cur_probs.numpy().tolist()) - else: - preds.extend(tf.math.argmax(cur_probs, axis=1).numpy()) - golds.extend(cur_labels.numpy().tolist()) - except (StopIteration, tf.errors.OutOfRangeError): - tf.experimental.async_clear_error() - return preds, golds - - test_iter = iter(strategy.distribute_datasets_from_function(eval_input_fn)) - predictions, labels = _run_evaluation(test_iter) - - return predictions, labels - - -def export_classifier(model_export_path, input_meta_data, bert_config, - model_dir): - """Exports a trained model as a `SavedModel` for inference. - - Args: - model_export_path: a string specifying the path to the SavedModel directory. - input_meta_data: dictionary containing meta data about input and model. - bert_config: Bert configuration file to define core bert layers. - model_dir: The directory where the model weights and training/evaluation - summaries are stored. - - Raises: - Export path is not specified, got an empty string or None. - """ - if not model_export_path: - raise ValueError('Export path is not specified: %s' % model_export_path) - if not model_dir: - raise ValueError('Export path is not specified: %s' % model_dir) - - # Export uses float32 for now, even if training uses mixed precision. - tf.keras.mixed_precision.set_global_policy('float32') - classifier_model = bert_models.classifier_model( - bert_config, - input_meta_data.get('num_labels', 1), - hub_module_url=FLAGS.hub_module_url, - hub_module_trainable=False)[0] - - model_saving_utils.export_bert_model( - model_export_path, model=classifier_model, checkpoint_dir=model_dir) - - -def run_bert(strategy, - input_meta_data, - model_config, - train_input_fn=None, - eval_input_fn=None, - init_checkpoint=None, - custom_callbacks=None, - custom_metrics=None): - """Run BERT training.""" - # Enables XLA in Session Config. Should not be set for TPU. 
- keras_utils.set_session_config(FLAGS.enable_xla) - performance.set_mixed_precision_policy(common_flags.dtype()) - - epochs = FLAGS.num_train_epochs * FLAGS.num_eval_per_epoch - train_data_size = ( - input_meta_data['train_data_size'] // FLAGS.num_eval_per_epoch) - if FLAGS.train_data_size: - train_data_size = min(train_data_size, FLAGS.train_data_size) - logging.info('Updated train_data_size: %s', train_data_size) - steps_per_epoch = int(train_data_size / FLAGS.train_batch_size) - warmup_steps = int(epochs * train_data_size * 0.1 / FLAGS.train_batch_size) - eval_steps = int( - math.ceil(input_meta_data['eval_data_size'] / FLAGS.eval_batch_size)) - - if not strategy: - raise ValueError('Distribution strategy has not been specified.') - - if not custom_callbacks: - custom_callbacks = [] - - if FLAGS.log_steps: - custom_callbacks.append( - keras_utils.TimeHistory( - batch_size=FLAGS.train_batch_size, - log_steps=FLAGS.log_steps, - logdir=FLAGS.model_dir)) - - trained_model, _ = run_bert_classifier( - strategy, - model_config, - input_meta_data, - FLAGS.model_dir, - epochs, - steps_per_epoch, - FLAGS.steps_per_loop, - eval_steps, - warmup_steps, - FLAGS.learning_rate, - init_checkpoint or FLAGS.init_checkpoint, - train_input_fn, - eval_input_fn, - custom_callbacks=custom_callbacks, - custom_metrics=custom_metrics) - - if FLAGS.model_export_path: - model_saving_utils.export_bert_model( - FLAGS.model_export_path, model=trained_model) - return trained_model - - -def custom_main(custom_callbacks=None, custom_metrics=None): - """Run classification or regression. - - Args: - custom_callbacks: list of tf.keras.Callbacks passed to training loop. - custom_metrics: list of metrics passed to the training loop. - """ - gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param) - - with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader: - input_meta_data = json.loads(reader.read().decode('utf-8')) - label_type = LABEL_TYPES_MAP[input_meta_data.get('label_type', 'int')] - include_sample_weights = input_meta_data.get('has_sample_weights', False) - - if not FLAGS.model_dir: - FLAGS.model_dir = '/tmp/bert20/' - - bert_config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file) - - if FLAGS.mode == 'export_only': - export_classifier(FLAGS.model_export_path, input_meta_data, bert_config, - FLAGS.model_dir) - return - - strategy = distribute_utils.get_distribution_strategy( - distribution_strategy=FLAGS.distribution_strategy, - num_gpus=FLAGS.num_gpus, - tpu_address=FLAGS.tpu) - eval_input_fn = get_dataset_fn( - FLAGS.eval_data_path, - input_meta_data['max_seq_length'], - FLAGS.eval_batch_size, - is_training=False, - label_type=label_type, - include_sample_weights=include_sample_weights) - - if FLAGS.mode == 'predict': - num_labels = input_meta_data.get('num_labels', 1) - with strategy.scope(): - classifier_model = bert_models.classifier_model( - bert_config, num_labels)[0] - checkpoint = tf.train.Checkpoint(model=classifier_model) - latest_checkpoint_file = ( - FLAGS.predict_checkpoint_path or - tf.train.latest_checkpoint(FLAGS.model_dir)) - assert latest_checkpoint_file - logging.info('Checkpoint file %s found and restoring from ' - 'checkpoint', latest_checkpoint_file) - checkpoint.restore( - latest_checkpoint_file).assert_existing_objects_matched() - preds, _ = get_predictions_and_labels( - strategy, - classifier_model, - eval_input_fn, - is_regression=(num_labels == 1), - return_probs=True) - output_predict_file = os.path.join(FLAGS.model_dir, 'test_results.tsv') - with 
tf.io.gfile.GFile(output_predict_file, 'w') as writer: - logging.info('***** Predict results *****') - for probabilities in preds: - output_line = '\t'.join( - str(class_probability) - for class_probability in probabilities) + '\n' - writer.write(output_line) - return - - if FLAGS.mode != 'train_and_eval': - raise ValueError('Unsupported mode is specified: %s' % FLAGS.mode) - train_input_fn = get_dataset_fn( - FLAGS.train_data_path, - input_meta_data['max_seq_length'], - FLAGS.train_batch_size, - is_training=True, - label_type=label_type, - include_sample_weights=include_sample_weights, - num_samples=FLAGS.train_data_size) - run_bert( - strategy, - input_meta_data, - bert_config, - train_input_fn, - eval_input_fn, - custom_callbacks=custom_callbacks, - custom_metrics=custom_metrics) - - -def main(_): - custom_main(custom_callbacks=None, custom_metrics=None) - - -if __name__ == '__main__': - flags.mark_flag_as_required('bert_config_file') - flags.mark_flag_as_required('input_meta_data_path') - flags.mark_flag_as_required('model_dir') - app.run(main) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/run_pretraining.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/run_pretraining.py deleted file mode 100644 index 864964a301d83cdd52d6a6ecd7a9c57d07190dde..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/run_pretraining.py +++ /dev/null @@ -1,234 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Run masked LM/next sentence pre-training for BERT in TF 2.x.""" - -# Import libraries -from absl import app -from absl import flags -from absl import logging -import gin -import tensorflow as tf -from official.common import distribute_utils -from official.modeling import performance -from official.nlp import optimization -from official.nlp.bert import bert_models -from official.nlp.bert import common_flags -from official.nlp.bert import configs -from official.nlp.bert import input_pipeline -from official.nlp.bert import model_training_utils - - -flags.DEFINE_string('input_files', None, - 'File path to retrieve training data for pre-training.') -# Model training specific flags. 
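The schedule arithmetic in `run_bert` (run_classifier.py above) is easy to misread: warmup is 10% of the total number of optimizer steps across all epochs. Worked through with illustrative numbers (not taken from any config in this repo):

```python
train_data_size = 392_702   # e.g. an MNLI-sized training set (illustrative)
train_batch_size = 32
num_train_epochs = 3
num_eval_per_epoch = 1

epochs = num_train_epochs * num_eval_per_epoch
steps_per_epoch = train_data_size // train_batch_size                  # 12271
total_steps = steps_per_epoch * epochs                                 # 36813
warmup_steps = int(epochs * train_data_size * 0.1 / train_batch_size)  # 3681
```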
-flags.DEFINE_integer(
-    'max_seq_length', 128,
-    'The maximum total input sequence length after WordPiece tokenization. '
-    'Sequences longer than this will be truncated, and sequences shorter '
-    'than this will be padded.')
-flags.DEFINE_integer('max_predictions_per_seq', 20,
-                     'Maximum predictions per sequence_output.')
-flags.DEFINE_integer('train_batch_size', 32, 'Total batch size for training.')
-flags.DEFINE_integer('num_steps_per_epoch', 1000,
-                     'Total number of training steps to run per epoch.')
-flags.DEFINE_float('warmup_steps', 10000,
-                   'Warmup steps for Adam weight decay optimizer.')
-flags.DEFINE_bool('use_next_sentence_label', True,
-                  'Whether to use next sentence label to compute final loss.')
-flags.DEFINE_integer('train_summary_interval', 0,
-                     'Step interval for training summaries. If the value is '
-                     'a negative number, then training summaries are not '
-                     'enabled.')
-
-common_flags.define_common_bert_flags()
-
-FLAGS = flags.FLAGS
-
-
-def get_pretrain_dataset_fn(input_file_pattern, seq_length,
-                            max_predictions_per_seq, global_batch_size,
-                            use_next_sentence_label=True):
-  """Returns input dataset from input file string."""
-  def _dataset_fn(ctx=None):
-    """Returns tf.data.Dataset for distributed BERT pretraining."""
-    input_patterns = input_file_pattern.split(',')
-    batch_size = ctx.get_per_replica_batch_size(global_batch_size)
-    train_dataset = input_pipeline.create_pretrain_dataset(
-        input_patterns,
-        seq_length,
-        max_predictions_per_seq,
-        batch_size,
-        is_training=True,
-        input_pipeline_context=ctx,
-        use_next_sentence_label=use_next_sentence_label)
-    return train_dataset
-
-  return _dataset_fn
-
-
-def get_loss_fn():
-  """Returns loss function for BERT pretraining."""
-
-  def _bert_pretrain_loss_fn(unused_labels, losses, **unused_args):
-    return tf.reduce_mean(losses)
-
-  return _bert_pretrain_loss_fn
-
-
-def run_customized_training(strategy,
-                            bert_config,
-                            init_checkpoint,
-                            max_seq_length,
-                            max_predictions_per_seq,
-                            model_dir,
-                            steps_per_epoch,
-                            steps_per_loop,
-                            epochs,
-                            initial_lr,
-                            warmup_steps,
-                            end_lr,
-                            optimizer_type,
-                            input_files,
-                            train_batch_size,
-                            use_next_sentence_label=True,
-                            train_summary_interval=0,
-                            custom_callbacks=None,
-                            explicit_allreduce=False,
-                            pre_allreduce_callbacks=None,
-                            post_allreduce_callbacks=None,
-                            allreduce_bytes_per_pack=0):
-  """Run BERT pretrain model training using low-level API."""
-
-  train_input_fn = get_pretrain_dataset_fn(input_files, max_seq_length,
-                                           max_predictions_per_seq,
-                                           train_batch_size,
-                                           use_next_sentence_label)
-
-  def _get_pretrain_model():
-    """Gets a pretraining model."""
-    pretrain_model, core_model = bert_models.pretrain_model(
-        bert_config, max_seq_length, max_predictions_per_seq,
-        use_next_sentence_label=use_next_sentence_label)
-    optimizer = optimization.create_optimizer(
-        initial_lr, steps_per_epoch * epochs, warmup_steps,
-        end_lr, optimizer_type)
-    pretrain_model.optimizer = performance.configure_optimizer(
-        optimizer,
-        use_float16=common_flags.use_float16(),
-        use_graph_rewrite=common_flags.use_graph_rewrite())
-    return pretrain_model, core_model
-
-  trained_model = model_training_utils.run_customized_training_loop(
-      strategy=strategy,
-      model_fn=_get_pretrain_model,
-      loss_fn=get_loss_fn(),
-      scale_loss=FLAGS.scale_loss,
-      model_dir=model_dir,
-      init_checkpoint=init_checkpoint,
-      train_input_fn=train_input_fn,
-      steps_per_epoch=steps_per_epoch,
-      steps_per_loop=steps_per_loop,
-      epochs=epochs,
-      sub_model_export_name='pretrained/bert_model',
-
explicit_allreduce=explicit_allreduce, - pre_allreduce_callbacks=pre_allreduce_callbacks, - post_allreduce_callbacks=post_allreduce_callbacks, - allreduce_bytes_per_pack=allreduce_bytes_per_pack, - train_summary_interval=train_summary_interval, - custom_callbacks=custom_callbacks) - - return trained_model - - -def run_bert_pretrain(strategy, custom_callbacks=None): - """Runs BERT pre-training.""" - - bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file) - if not strategy: - raise ValueError('Distribution strategy is not specified.') - - # Runs customized training loop. - logging.info('Training using customized training loop TF 2.0 with distributed' - 'strategy.') - - performance.set_mixed_precision_policy(common_flags.dtype()) - - # Only when explicit_allreduce = True, post_allreduce_callbacks and - # allreduce_bytes_per_pack will take effect. optimizer.apply_gradients() no - # longer implicitly allreduce gradients, users manually allreduce gradient and - # pass the allreduced grads_and_vars to apply_gradients(). - # With explicit_allreduce = True, clip_by_global_norm is moved to after - # allreduce. - return run_customized_training( - strategy, - bert_config, - FLAGS.init_checkpoint, # Used to initialize only the BERT submodel. - FLAGS.max_seq_length, - FLAGS.max_predictions_per_seq, - FLAGS.model_dir, - FLAGS.num_steps_per_epoch, - FLAGS.steps_per_loop, - FLAGS.num_train_epochs, - FLAGS.learning_rate, - FLAGS.warmup_steps, - FLAGS.end_lr, - FLAGS.optimizer_type, - FLAGS.input_files, - FLAGS.train_batch_size, - FLAGS.use_next_sentence_label, - FLAGS.train_summary_interval, - custom_callbacks=custom_callbacks, - explicit_allreduce=FLAGS.explicit_allreduce, - pre_allreduce_callbacks=[ - model_training_utils.clip_by_global_norm_callback - ], - allreduce_bytes_per_pack=FLAGS.allreduce_bytes_per_pack) - - -def main(_): - gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param) - if not FLAGS.model_dir: - FLAGS.model_dir = '/tmp/bert20/' - # Configures cluster spec for multi-worker distribution strategy. - if FLAGS.num_gpus > 0: - _ = distribute_utils.configure_cluster(FLAGS.worker_hosts, FLAGS.task_index) - strategy = distribute_utils.get_distribution_strategy( - distribution_strategy=FLAGS.distribution_strategy, - num_gpus=FLAGS.num_gpus, - all_reduce_alg=FLAGS.all_reduce_alg, - tpu_address=FLAGS.tpu) - if strategy: - print('***** Number of cores used : ', strategy.num_replicas_in_sync) - - run_bert_pretrain(strategy) - - -if __name__ == '__main__': - app.run(main) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/run_squad.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/run_squad.py deleted file mode 100644 index 31521003835f9695b3163b5fc13059f86d20cfa1..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/run_squad.py +++ /dev/null @@ -1,231 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""Run BERT on SQuAD 1.1 and SQuAD 2.0 in TF 2.x."""
-
-import json
-import os
-import time
-
-import npu_device
-
-
-# Import libraries
-from absl import app
-from absl import flags
-from absl import logging
-import gin
-import tensorflow as tf
-from official.common import distribute_utils
-from official.nlp.bert import configs as bert_configs
-from official.nlp.bert import run_squad_helper
-from official.nlp.bert import tokenization
-from official.nlp.data import squad_lib as squad_lib_wp
-from official.utils.misc import keras_utils
-
-
-flags.DEFINE_string('vocab_file', None,
-                    'The vocabulary file that the BERT model was trained on.')
-flags.DEFINE_integer(name='train_steps', default=0,
-                     help='Number of training steps; 0 means the value is '
-                          'derived from the training data size.')
-flags.DEFINE_boolean(name='use_fastgelu', default=True,
-                     help='Whether to enable fastgelu; default is True.')
-flags.DEFINE_string(name='precision_mode', default='allow_fp32_to_fp16',
-                    help='One of allow_fp32_to_fp16/force_fp16/'
-                         'must_keep_origin_dtype/allow_mix_precision.')
-flags.DEFINE_boolean(name='over_dump', default=False,
-                     help='Whether to enable overflow detection; default is False.')
-flags.DEFINE_boolean(name='data_dump_flag', default=False,
-                     help='Whether to dump data; default is False.')
-flags.DEFINE_string(name='data_dump_step', default="10",
-                    help='Step(s) at which to dump data; default is 10.')
-flags.DEFINE_boolean(name='profiling', default=False,
-                     help='Whether to enable profiling for performance '
-                          'debugging; default is False.')
-flags.DEFINE_string(name='profiling_dump_path', default="/home/data",
-                    help='The path to save profiling data.')
-flags.DEFINE_string(name='over_dump_path', default="/home/data",
-                    help='The path to save overflow dump data.')
-flags.DEFINE_string(name='data_dump_path', default="/home/data",
-                    help='The path to save dump data.')
-flags.DEFINE_boolean(name='use_mixlist', default=False,
-                     help='Whether to enable the mixlist; default is False.')
-flags.DEFINE_boolean(name='fusion_off_flag', default=False,
-                     help='Whether to disable operator fusion via a switch '
-                          'file; default is False.')
-flags.DEFINE_string(name='mixlist_file', default='ops_info.json',
-                    help='Mixlist file name; default is ops_info.json.')
-flags.DEFINE_string(name='fusion_off_file', default='fusion_switch.cfg',
-                    help='Fusion-off file name; default is fusion_switch.cfg.')
-flags.DEFINE_boolean(name='auto_tune', default=False,
-                     help='Whether to enable auto_tune; default is False.')
-# More flags can be found in run_squad_helper.
-run_squad_helper.define_common_squad_flags() - -FLAGS = flags.FLAGS - -def npu_config(): - FLAGS = flags.FLAGS - npu_config = {} - - if FLAGS.data_dump_flag: - npu_device.global_options().dump_config.enable_dump = True - npu_device.global_options().dump_config.dump_path = FLAGS.data_dump_path - npu_device.global_options().dump_config.dump_step = FLAGS.data_dump_step - npu_device.global_options().dump_config.dump_mode = "all" - - if FLAGS.over_dump: - npu_device.global_options().dump_config.enable_dump_debug = True - npu_device.global_options().dump_config.dump_path = FLAGS.over_dump_path - npu_device.global_options().dump_config.dump_debug_mode = "all" - - if FLAGS.profiling: - npu_device.global_options().profiling_config.enable_profiling = True - profiling_options = '{"output":"' + FLAGS.profiling_dump_path + '", \ - "training_trace":"on", \ - "task_trace":"on", \ - "aicpu":"on", \ - "aic_metrics":"PipeUtilization",\ - "fp_point":"", \ - "bp_point":""}' - npu_device.global_options().profiling_config.profiling_options = profiling_options - npu_device.global_options().precision_mode=FLAGS.precision_mode - if FLAGS.use_mixlist and FLAGS.precision_mode=='allow_mix_precision': - npu_device.global_options().modify_mixlist=FLAGS.mixlist_file - if FLAGS.fusion_off_flag: - npu_device.global_options().fusion_switch_file=FLAGS.fusion_off_file - if FLAGS.auto_tune: - npu_device.global_options().auto_tune_mode="RL,GA" - npu_device.open().as_default() - -def train_squad(strategy, - input_meta_data, - custom_callbacks=None, - run_eagerly=False, - init_checkpoint=None, - sub_model_export_name=None): - """Run bert squad training.""" - bert_config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file) - init_checkpoint = init_checkpoint or FLAGS.init_checkpoint - run_squad_helper.train_squad(strategy, input_meta_data, bert_config, - custom_callbacks, run_eagerly, init_checkpoint, - sub_model_export_name=sub_model_export_name) - - -def predict_squad(strategy, input_meta_data): - """Makes predictions for the squad dataset.""" - bert_config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file) - tokenizer = tokenization.FullTokenizer( - vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) - run_squad_helper.predict_squad( - strategy, input_meta_data, tokenizer, bert_config, squad_lib_wp) - - -def eval_squad(strategy, input_meta_data): - """Evaluate on the squad dataset.""" - bert_config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file) - tokenizer = tokenization.FullTokenizer( - vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) - eval_metrics = run_squad_helper.eval_squad( - strategy, input_meta_data, tokenizer, bert_config, squad_lib_wp) - return eval_metrics - - -def export_squad(model_export_path, input_meta_data): - """Exports a trained model as a `SavedModel` for inference. - - Args: - model_export_path: a string specifying the path to the SavedModel directory. - input_meta_data: dictionary containing meta data about input and model. - - Raises: - Export path is not specified, got an empty string or None. 
- """ - bert_config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file) - run_squad_helper.export_squad(model_export_path, input_meta_data, bert_config) - - -def main(_): - gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param) - npu_config() - - with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader: - input_meta_data = json.loads(reader.read().decode('utf-8')) - - if FLAGS.mode == 'export_only': - export_squad(FLAGS.model_export_path, input_meta_data) - return - - # Configures cluster spec for multi-worker distribution strategy. - if FLAGS.num_gpus > 0: - _ = distribute_utils.configure_cluster(FLAGS.worker_hosts, FLAGS.task_index) - strategy = distribute_utils.get_distribution_strategy( - distribution_strategy=FLAGS.distribution_strategy, - num_gpus=FLAGS.num_gpus, - all_reduce_alg=FLAGS.all_reduce_alg, - tpu_address=FLAGS.tpu) - - if 'train' in FLAGS.mode: - if FLAGS.log_steps: - custom_callbacks = [keras_utils.TimeHistory( - batch_size=FLAGS.train_batch_size, - log_steps=FLAGS.log_steps, - logdir=FLAGS.model_dir, - )] - else: - custom_callbacks = None - - train_squad( - strategy, - input_meta_data, - custom_callbacks=custom_callbacks, - run_eagerly=FLAGS.run_eagerly, - sub_model_export_name=FLAGS.sub_model_export_name, - ) - if 'predict' in FLAGS.mode: - predict_squad(strategy, input_meta_data) - if 'eval' in FLAGS.mode: - eval_metrics = eval_squad(strategy, input_meta_data) - f1_score = eval_metrics['final_f1'] - logging.info('SQuAD eval F1-score: %f', f1_score) - summary_dir = os.path.join(FLAGS.model_dir, 'summaries', 'eval') - summary_writer = tf.summary.create_file_writer(summary_dir) - with summary_writer.as_default(): - # TODO(lehou): write to the correct step number. - tf.summary.scalar('F1-score', f1_score, step=0) - summary_writer.flush() - # Also write eval_metrics to json file. - squad_lib_wp.write_to_json_files( - eval_metrics, os.path.join(summary_dir, 'eval_metrics.json')) - time.sleep(60) - - -if __name__ == '__main__': - flags.mark_flag_as_required('bert_config_file') - flags.mark_flag_as_required('model_dir') - app.run(main) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/run_squad_helper.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/run_squad_helper.py deleted file mode 100644 index 385f94ca7a3b891b540041252d1858f42a254294..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/run_squad_helper.py +++ /dev/null @@ -1,489 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Library for running BERT family models on SQuAD 1.1/2.0 in TF 2.x.""" - -import collections -import json -import os - -from absl import flags -from absl import logging -import tensorflow as tf -from official.modeling import performance -from official.nlp import optimization -from official.nlp.bert import bert_models -from official.nlp.bert import common_flags -from official.nlp.bert import input_pipeline -from official.nlp.bert import model_saving_utils -from official.nlp.bert import model_training_utils -from official.nlp.bert import squad_evaluate_v1_1 -from official.nlp.bert import squad_evaluate_v2_0 -from official.nlp.data import squad_lib_sp -from official.utils.misc import keras_utils - - -def define_common_squad_flags(): - """Defines common flags used by SQuAD tasks.""" - flags.DEFINE_enum( - 'mode', 'train_and_eval', [ - 'train_and_eval', 'train_and_predict', 'train', 'eval', 'predict', - 'export_only' - ], 'One of {"train_and_eval", "train_and_predict", ' - '"train", "eval", "predict", "export_only"}. ' - '`train_and_eval`: train & predict to json files & compute eval metrics. ' - '`train_and_predict`: train & predict to json files. ' - '`train`: only trains the model. ' - '`eval`: predict answers from squad json file & compute eval metrics. ' - '`predict`: predict answers from the squad json file. ' - '`export_only`: will take the latest checkpoint inside ' - 'model_dir and export a `SavedModel`.') - flags.DEFINE_string('train_data_path', '', - 'Training data path with train tfrecords.') - flags.DEFINE_string( - 'input_meta_data_path', None, - 'Path to file that contains meta data about input ' - 'to be used for training and evaluation.') - # Model training specific flags. - flags.DEFINE_integer('train_batch_size', 32, 'Total batch size for training.') - # Predict processing related. - flags.DEFINE_string( - 'predict_file', None, 'SQuAD prediction json file path. ' - '`predict` mode supports multiple files: one can use ' - 'wildcard to specify multiple files and it can also be ' - 'multiple file patterns separated by comma. Note that ' - '`eval` mode only supports a single predict file.') - flags.DEFINE_bool( - 'do_lower_case', True, - 'Whether to lower case the input text. Should be True for uncased ' - 'models and False for cased models.') - flags.DEFINE_float( - 'null_score_diff_threshold', 0.0, - 'If null_score - best_non_null is greater than the threshold, ' - 'predict null. This is only used for SQuAD v2.') - flags.DEFINE_bool( - 'verbose_logging', False, - 'If true, all of the warnings related to data processing will be ' - 'printed. A number of warnings are expected for a normal SQuAD ' - 'evaluation.') - flags.DEFINE_integer('predict_batch_size', 8, - 'Total batch size for prediction.') - flags.DEFINE_integer( - 'n_best_size', 20, - 'The total number of n-best predictions to generate in the ' - 'nbest_predictions.json output file.') - flags.DEFINE_integer( - 'max_answer_length', 30, - 'The maximum length of an answer that can be generated. 
This is needed ' - 'because the start and end predictions are not conditioned on one ' - 'another.') - - common_flags.define_common_bert_flags() - - -FLAGS = flags.FLAGS - - -def squad_loss_fn(start_positions, end_positions, start_logits, end_logits): - """Returns sparse categorical crossentropy for start/end logits.""" - start_loss = tf.keras.losses.sparse_categorical_crossentropy( - start_positions, start_logits, from_logits=True) - end_loss = tf.keras.losses.sparse_categorical_crossentropy( - end_positions, end_logits, from_logits=True) - - total_loss = (tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)) / 2 - return total_loss - - -def get_loss_fn(): - """Gets a loss function for squad task.""" - - def _loss_fn(labels, model_outputs): - start_positions = labels['start_positions'] - end_positions = labels['end_positions'] - start_logits, end_logits = model_outputs - return squad_loss_fn(start_positions, end_positions, start_logits, - end_logits) - - return _loss_fn - - -RawResult = collections.namedtuple('RawResult', - ['unique_id', 'start_logits', 'end_logits']) - - -def get_raw_results(predictions): - """Converts multi-replica predictions to RawResult.""" - for unique_ids, start_logits, end_logits in zip(predictions['unique_ids'], - predictions['start_logits'], - predictions['end_logits']): - for values in zip(unique_ids.numpy(), start_logits.numpy(), - end_logits.numpy()): - yield RawResult( - unique_id=values[0], - start_logits=values[1].tolist(), - end_logits=values[2].tolist()) - - -def get_dataset_fn(input_file_pattern, max_seq_length, global_batch_size, - is_training): - """Gets a closure to create a dataset..""" - - def _dataset_fn(ctx=None): - """Returns tf.data.Dataset for distributed BERT pretraining.""" - batch_size = ctx.get_per_replica_batch_size( - global_batch_size) if ctx else global_batch_size - dataset = input_pipeline.create_squad_dataset( - input_file_pattern, - max_seq_length, - batch_size, - is_training=is_training, - input_pipeline_context=ctx) - return dataset - - return _dataset_fn - - -def get_squad_model_to_predict(strategy, bert_config, checkpoint_path, - input_meta_data): - """Gets a squad model to make predictions.""" - with strategy.scope(): - # Prediction always uses float32, even if training uses mixed precision. 
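A quick sketch of how `squad_loss_fn` above is fed; shapes and label values are toy stand-ins, with `seq_len` playing the role of `max_seq_length`:

```python
import tensorflow as tf

batch, seq_len = 2, 384
start_logits = tf.random.normal([batch, seq_len])
end_logits = tf.random.normal([batch, seq_len])
start_positions = tf.constant([5, 10])
end_positions = tf.constant([7, 12])

# Scalar loss: mean of the start and end cross-entropies.
loss = squad_loss_fn(start_positions, end_positions, start_logits, end_logits)
```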
- tf.keras.mixed_precision.set_global_policy('float32') - squad_model, _ = bert_models.squad_model( - bert_config, - input_meta_data['max_seq_length'], - hub_module_url=FLAGS.hub_module_url) - - if checkpoint_path is None: - checkpoint_path = tf.train.latest_checkpoint(FLAGS.model_dir) - logging.info('Restoring checkpoints from %s', checkpoint_path) - checkpoint = tf.train.Checkpoint(model=squad_model) - checkpoint.restore(checkpoint_path).expect_partial() - return squad_model - - -def predict_squad_customized(strategy, input_meta_data, predict_tfrecord_path, - num_steps, squad_model): - """Make predictions using a Bert-based squad model.""" - predict_dataset_fn = get_dataset_fn( - predict_tfrecord_path, - input_meta_data['max_seq_length'], - FLAGS.predict_batch_size, - is_training=False) - predict_iterator = iter( - strategy.distribute_datasets_from_function(predict_dataset_fn)) - - @tf.function - def predict_step(iterator): - """Predicts on distributed devices.""" - - def _replicated_step(inputs): - """Replicated prediction calculation.""" - x, _ = inputs - unique_ids = x.pop('unique_ids') - start_logits, end_logits = squad_model(x, training=False) - return dict( - unique_ids=unique_ids, - start_logits=start_logits, - end_logits=end_logits) - - outputs = strategy.run(_replicated_step, args=(next(iterator),)) - return tf.nest.map_structure(strategy.experimental_local_results, outputs) - - all_results = [] - for _ in range(num_steps): - predictions = predict_step(predict_iterator) - for result in get_raw_results(predictions): - all_results.append(result) - if len(all_results) % 100 == 0: - logging.info('Made predictions for %d records.', len(all_results)) - return all_results - - -def train_squad(strategy, - input_meta_data, - bert_config, - custom_callbacks=None, - run_eagerly=False, - init_checkpoint=None, - sub_model_export_name=None): - """Run bert squad training.""" - if strategy: - logging.info('Training using customized training loop with distribution' - ' strategy.') - # Enables XLA in Session Config. Should not be set for TPU. - keras_utils.set_session_config(FLAGS.enable_xla) - performance.set_mixed_precision_policy(common_flags.dtype()) - - epochs = FLAGS.num_train_epochs - num_train_examples = input_meta_data['train_data_size'] - max_seq_length = input_meta_data['max_seq_length'] - steps_per_epoch = int(num_train_examples / FLAGS.train_batch_size) if not FLAGS.train_steps else FLAGS.train_steps - logging.info('steps_per_epoch: %d', steps_per_epoch) - warmup_steps = int(epochs * num_train_examples * 0.1 / FLAGS.train_batch_size) - train_input_fn = get_dataset_fn( - FLAGS.train_data_path, - max_seq_length, - FLAGS.train_batch_size, - is_training=True) - - def _get_squad_model(): - """Get Squad model and optimizer.""" - squad_model, core_model = bert_models.squad_model( - bert_config, - max_seq_length, - hub_module_url=FLAGS.hub_module_url, - hub_module_trainable=FLAGS.hub_module_trainable) - optimizer = optimization.create_optimizer(FLAGS.learning_rate, - steps_per_epoch * epochs, - warmup_steps, FLAGS.end_lr, - FLAGS.optimizer_type) - - squad_model.optimizer = performance.configure_optimizer( - optimizer, - use_float16=common_flags.use_float16(), - use_graph_rewrite=common_flags.use_graph_rewrite()) - return squad_model, core_model - - # Only when explicit_allreduce = True, post_allreduce_callbacks and - # allreduce_bytes_per_pack will take effect. 
optimizer.apply_gradients() no - # longer implicitly allreduce gradients, users manually allreduce gradient and - # pass the allreduced grads_and_vars to apply_gradients(). - # With explicit_allreduce = True, clip_by_global_norm is moved to after - # allreduce. - model_training_utils.run_customized_training_loop( - strategy=strategy, - model_fn=_get_squad_model, - loss_fn=get_loss_fn(), - model_dir=FLAGS.model_dir, - steps_per_epoch=steps_per_epoch, - steps_per_loop=FLAGS.steps_per_loop, - epochs=epochs, - train_input_fn=train_input_fn, - init_checkpoint=init_checkpoint or FLAGS.init_checkpoint, - sub_model_export_name=sub_model_export_name, - run_eagerly=run_eagerly, - custom_callbacks=custom_callbacks, - explicit_allreduce=FLAGS.explicit_allreduce, - pre_allreduce_callbacks=[ - model_training_utils.clip_by_global_norm_callback - ], - allreduce_bytes_per_pack=FLAGS.allreduce_bytes_per_pack) - - -def prediction_output_squad(strategy, input_meta_data, tokenizer, squad_lib, - predict_file, squad_model): - """Makes predictions for a squad dataset.""" - doc_stride = input_meta_data['doc_stride'] - max_query_length = input_meta_data['max_query_length'] - # Whether data should be in Ver 2.0 format. - version_2_with_negative = input_meta_data.get('version_2_with_negative', - False) - eval_examples = squad_lib.read_squad_examples( - input_file=predict_file, - is_training=False, - version_2_with_negative=version_2_with_negative) - - eval_writer = squad_lib.FeatureWriter( - filename=os.path.join(FLAGS.model_dir, 'eval.tf_record'), - is_training=False) - eval_features = [] - - def _append_feature(feature, is_padding): - if not is_padding: - eval_features.append(feature) - eval_writer.process_feature(feature) - - # TPU requires a fixed batch size for all batches, therefore the number - # of examples must be a multiple of the batch size, or else examples - # will get dropped. So we pad with fake examples which are ignored - # later on. - kwargs = dict( - examples=eval_examples, - tokenizer=tokenizer, - max_seq_length=input_meta_data['max_seq_length'], - doc_stride=doc_stride, - max_query_length=max_query_length, - is_training=False, - output_fn=_append_feature, - batch_size=FLAGS.predict_batch_size) - - # squad_lib_sp requires one more argument 'do_lower_case'. 
- if squad_lib == squad_lib_sp: - kwargs['do_lower_case'] = FLAGS.do_lower_case - dataset_size = squad_lib.convert_examples_to_features(**kwargs) - eval_writer.close() - - logging.info('***** Running predictions *****') - logging.info(' Num orig examples = %d', len(eval_examples)) - logging.info(' Num split examples = %d', len(eval_features)) - logging.info(' Batch size = %d', FLAGS.predict_batch_size) - - num_steps = int(dataset_size / FLAGS.predict_batch_size) - all_results = predict_squad_customized(strategy, input_meta_data, - eval_writer.filename, num_steps, - squad_model) - - all_predictions, all_nbest_json, scores_diff_json = ( - squad_lib.postprocess_output( - eval_examples, - eval_features, - all_results, - FLAGS.n_best_size, - FLAGS.max_answer_length, - FLAGS.do_lower_case, - version_2_with_negative=version_2_with_negative, - null_score_diff_threshold=FLAGS.null_score_diff_threshold, - verbose=FLAGS.verbose_logging)) - - return all_predictions, all_nbest_json, scores_diff_json - - -def dump_to_files(all_predictions, - all_nbest_json, - scores_diff_json, - squad_lib, - version_2_with_negative, - file_prefix=''): - """Save output to json files.""" - output_prediction_file = os.path.join(FLAGS.model_dir, - '%spredictions.json' % file_prefix) - output_nbest_file = os.path.join(FLAGS.model_dir, - '%snbest_predictions.json' % file_prefix) - output_null_log_odds_file = os.path.join(FLAGS.model_dir, file_prefix, - '%snull_odds.json' % file_prefix) - logging.info('Writing predictions to: %s', (output_prediction_file)) - logging.info('Writing nbest to: %s', (output_nbest_file)) - - squad_lib.write_to_json_files(all_predictions, output_prediction_file) - squad_lib.write_to_json_files(all_nbest_json, output_nbest_file) - if version_2_with_negative: - squad_lib.write_to_json_files(scores_diff_json, output_null_log_odds_file) - - -def _get_matched_files(input_path): - """Returns all files that matches the input_path.""" - input_patterns = input_path.strip().split(',') - all_matched_files = [] - for input_pattern in input_patterns: - input_pattern = input_pattern.strip() - if not input_pattern: - continue - matched_files = tf.io.gfile.glob(input_pattern) - if not matched_files: - raise ValueError('%s does not match any files.' 
% input_pattern) - else: - all_matched_files.extend(matched_files) - return sorted(all_matched_files) - - -def predict_squad(strategy, - input_meta_data, - tokenizer, - bert_config, - squad_lib, - init_checkpoint=None): - """Get prediction results and evaluate them to hard drive.""" - if init_checkpoint is None: - init_checkpoint = tf.train.latest_checkpoint(FLAGS.model_dir) - - all_predict_files = _get_matched_files(FLAGS.predict_file) - squad_model = get_squad_model_to_predict(strategy, bert_config, - init_checkpoint, input_meta_data) - for idx, predict_file in enumerate(all_predict_files): - all_predictions, all_nbest_json, scores_diff_json = prediction_output_squad( - strategy, input_meta_data, tokenizer, squad_lib, predict_file, - squad_model) - if len(all_predict_files) == 1: - file_prefix = '' - else: - # if predict_file is /path/xquad.ar.json, the `file_prefix` may be - # "xquad.ar-0-" - file_prefix = '%s-' % os.path.splitext( - os.path.basename(all_predict_files[idx]))[0] - dump_to_files(all_predictions, all_nbest_json, scores_diff_json, squad_lib, - input_meta_data.get('version_2_with_negative', False), - file_prefix) - - -def eval_squad(strategy, - input_meta_data, - tokenizer, - bert_config, - squad_lib, - init_checkpoint=None): - """Get prediction results and evaluate them against ground truth.""" - if init_checkpoint is None: - init_checkpoint = tf.train.latest_checkpoint(FLAGS.model_dir) - - all_predict_files = _get_matched_files(FLAGS.predict_file) - if len(all_predict_files) != 1: - raise ValueError('`eval_squad` only supports one predict file, ' - 'but got %s' % all_predict_files) - - squad_model = get_squad_model_to_predict(strategy, bert_config, - init_checkpoint, input_meta_data) - all_predictions, all_nbest_json, scores_diff_json = prediction_output_squad( - strategy, input_meta_data, tokenizer, squad_lib, all_predict_files[0], - squad_model) - dump_to_files(all_predictions, all_nbest_json, scores_diff_json, squad_lib, - input_meta_data.get('version_2_with_negative', False)) - - with tf.io.gfile.GFile(FLAGS.predict_file, 'r') as reader: - dataset_json = json.load(reader) - pred_dataset = dataset_json['data'] - if input_meta_data.get('version_2_with_negative', False): - eval_metrics = squad_evaluate_v2_0.evaluate(pred_dataset, all_predictions, - scores_diff_json) - else: - eval_metrics = squad_evaluate_v1_1.evaluate(pred_dataset, all_predictions) - return eval_metrics - - -def export_squad(model_export_path, input_meta_data, bert_config): - """Exports a trained model as a `SavedModel` for inference. - - Args: - model_export_path: a string specifying the path to the SavedModel directory. - input_meta_data: dictionary containing meta data about input and model. - bert_config: Bert configuration file to define core bert layers. - - Raises: - Export path is not specified, got an empty string or None. - """ - if not model_export_path: - raise ValueError('Export path is not specified: %s' % model_export_path) - # Export uses float32 for now, even if training uses mixed precision. 
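`_get_matched_files` above accepts a comma-separated list of glob patterns; a hypothetical call to show the contract (paths are placeholders):

```python
files = _get_matched_files('data/xquad.*.json, extra/dev.json')
# -> sorted union of both globs; a pattern matching no files raises
#    ValueError instead of being silently dropped.
```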
- tf.keras.mixed_precision.set_global_policy('float32') - squad_model, _ = bert_models.squad_model(bert_config, - input_meta_data['max_seq_length']) - model_saving_utils.export_bert_model( - model_export_path, model=squad_model, checkpoint_dir=FLAGS.model_dir) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/serving.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/serving.py deleted file mode 100644 index cac5425123504666b7721659be25c64ae52ea024..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/serving.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Examples of SavedModel export for tf-serving.""" - -from absl import app -from absl import flags -import tensorflow as tf - -from official.nlp.bert import bert_models -from official.nlp.bert import configs - -flags.DEFINE_integer( - "sequence_length", None, "Sequence length to parse the tf.Example. If " - "sequence_length > 0, add a signature for serialized " - "tf.Example and define the parsing specification by the " - "sequence_length.") -flags.DEFINE_string("bert_config_file", None, - "Bert configuration file to define core bert layers.") -flags.DEFINE_string("model_checkpoint_path", None, - "File path to TF model checkpoint.") -flags.DEFINE_string("export_path", None, - "Destination folder to export the serving SavedModel.") - -FLAGS = flags.FLAGS - - -class BertServing(tf.keras.Model): - """Bert transformer encoder model for serving.""" - - def __init__(self, bert_config, name_to_features=None, name="serving_model"): - super(BertServing, self).__init__(name=name) - self.encoder = bert_models.get_transformer_encoder( - bert_config, sequence_length=None) - self.name_to_features = name_to_features - - def call(self, inputs): - input_word_ids = inputs["input_ids"] - input_mask = inputs["input_mask"] - input_type_ids = inputs["segment_ids"] - - encoder_outputs, _ = self.encoder( - [input_word_ids, input_mask, input_type_ids]) - return encoder_outputs - - def serve_body(self, input_ids, input_mask=None, segment_ids=None): - if segment_ids is None: - # Requires CLS token is the first token of inputs. 
- segment_ids = tf.zeros_like(input_ids) - if input_mask is None: - # The mask has 1 for real tokens and 0 for padding tokens. - input_mask = tf.where( - tf.equal(input_ids, 0), tf.zeros_like(input_ids), - tf.ones_like(input_ids)) - - inputs = dict( - input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids) - return self.call(inputs) - - @tf.function - def serve(self, input_ids, input_mask=None, segment_ids=None): - outputs = self.serve_body(input_ids, input_mask, segment_ids) - # Returns a dictionary to control SignatureDef output signature. - return {"outputs": outputs[-1]} - - @tf.function - def serve_examples(self, inputs): - features = tf.io.parse_example(inputs, self.name_to_features) - for key in list(features.keys()): - t = features[key] - if t.dtype == tf.int64: - t = tf.cast(t, tf.int32) - features[key] = t - return self.serve( - features["input_ids"], - input_mask=features["input_mask"] if "input_mask" in features else None, - segment_ids=features["segment_ids"] - if "segment_ids" in features else None) - - @classmethod - def export(cls, model, export_dir): - if not isinstance(model, cls): - raise ValueError("Invalid model instance: %s, it should be a %s" % - (model, cls)) - - signatures = { - "serving_default": - model.serve.get_concrete_function( - input_ids=tf.TensorSpec( - shape=[None, None], dtype=tf.int32, name="inputs")), - } - if model.name_to_features: - signatures[ - "serving_examples"] = model.serve_examples.get_concrete_function( - tf.TensorSpec(shape=[None], dtype=tf.string, name="examples")) - tf.saved_model.save(model, export_dir=export_dir, signatures=signatures) - - -def main(_): - sequence_length = FLAGS.sequence_length - if sequence_length is not None and sequence_length > 0: - name_to_features = { - "input_ids": tf.io.FixedLenFeature([sequence_length], tf.int64), - "input_mask": tf.io.FixedLenFeature([sequence_length], tf.int64), - "segment_ids": tf.io.FixedLenFeature([sequence_length], tf.int64), - } - else: - name_to_features = None - bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file) - serving_model = BertServing( - bert_config=bert_config, name_to_features=name_to_features) - checkpoint = tf.train.Checkpoint(model=serving_model.encoder) - checkpoint.restore(FLAGS.model_checkpoint_path - ).assert_existing_objects_matched().run_restore_ops() - BertServing.export(serving_model, FLAGS.export_path) - - -if __name__ == "__main__": - flags.mark_flag_as_required("bert_config_file") - flags.mark_flag_as_required("model_checkpoint_path") - flags.mark_flag_as_required("export_path") - app.run(main) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/squad_evaluate_v1_1.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/squad_evaluate_v1_1.py deleted file mode 100644 index cada87b5d88c9303ed8ef395ad0eddd34d27c6bf..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/squad_evaluate_v1_1.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
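`serve_body` in serving.py above derives the padding mask directly from the ids whenever the caller omits it; concretely:

```python
import tensorflow as tf

input_ids = tf.constant([[101, 2023, 2003, 102, 0, 0]])
input_mask = tf.where(tf.equal(input_ids, 0),
                      tf.zeros_like(input_ids), tf.ones_like(input_ids))
# -> [[1, 1, 1, 1, 0, 0]]: 1 for real tokens, 0 for padding.
```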
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""Evaluation of SQuAD predictions (version 1.1).
-
-The functions are copied from
-https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/.
-
-The SQuAD dataset is described in this paper:
-SQuAD: 100,000+ Questions for Machine Comprehension of Text
-Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, Percy Liang
-https://nlp.stanford.edu/pubs/rajpurkar2016squad.pdf
-"""
-
-import collections
-import re
-import string
-
-# pylint: disable=g-bad-import-order
-
-from absl import logging
-# pylint: enable=g-bad-import-order
-
-
-def _normalize_answer(s):
-  """Lowers text and removes punctuation, articles and extra whitespace."""
-
-  def remove_articles(text):
-    return re.sub(r"\b(a|an|the)\b", " ", text)
-
-  def white_space_fix(text):
-    return " ".join(text.split())
-
-  def remove_punc(text):
-    exclude = set(string.punctuation)
-    return "".join(ch for ch in text if ch not in exclude)
-
-  def lower(text):
-    return text.lower()
-
-  return white_space_fix(remove_articles(remove_punc(lower(s))))
-
-
-def _f1_score(prediction, ground_truth):
-  """Computes F1 score by comparing prediction to ground truth."""
-  prediction_tokens = _normalize_answer(prediction).split()
-  ground_truth_tokens = _normalize_answer(ground_truth).split()
-  prediction_counter = collections.Counter(prediction_tokens)
-  ground_truth_counter = collections.Counter(ground_truth_tokens)
-  common = prediction_counter & ground_truth_counter
-  num_same = sum(common.values())
-  if num_same == 0:
-    return 0
-  precision = 1.0 * num_same / len(prediction_tokens)
-  recall = 1.0 * num_same / len(ground_truth_tokens)
-  f1 = (2 * precision * recall) / (precision + recall)
-  return f1
-
-
-def _exact_match_score(prediction, ground_truth):
-  """Checks if predicted answer exactly matches ground truth answer."""
-  return _normalize_answer(prediction) == _normalize_answer(ground_truth)
-
-
-def _metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
-  """Computes the max over all metric scores."""
-  scores_for_ground_truths = []
-  for ground_truth in ground_truths:
-    score = metric_fn(prediction, ground_truth)
-    scores_for_ground_truths.append(score)
-  return max(scores_for_ground_truths)
-
-
-def evaluate(dataset, predictions):
-  """Evaluates predictions for a dataset."""
-  f1 = exact_match = total = 0
-  for article in dataset:
-    for paragraph in article["paragraphs"]:
-      for qa in paragraph["qas"]:
-        total += 1
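-        # Each prediction is scored against every reference answer; only the
-        # best (maximum) exact-match and F1 scores contribute to the totals.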
- if qa["id"] not in predictions: - message = "Unanswered question " + qa["id"] + " will receive score 0." - logging.error(message) - continue - ground_truths = [entry["text"] for entry in qa["answers"]] - prediction = predictions[qa["id"]] - exact_match += _metric_max_over_ground_truths(_exact_match_score, - prediction, ground_truths) - f1 += _metric_max_over_ground_truths(_f1_score, prediction, - ground_truths) - - exact_match = exact_match / total - f1 = f1 / total - - return {"exact_match": exact_match, "final_f1": f1} diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/squad_evaluate_v2_0.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/squad_evaluate_v2_0.py deleted file mode 100644 index 1a325f7ec569a8a15526a693b0f087283b04854f..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/squad_evaluate_v2_0.py +++ /dev/null @@ -1,265 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Evaluation script for SQuAD version 2.0. - -The functions are copied and modified from -https://raw.githubusercontent.com/white127/SQUAD-2.0-bidaf/master/evaluate-v2.0.py - -In addition to basic functionality, we also compute additional statistics and -plot precision-recall curves if an additional na_prob.json file is provided. -This file is expected to map question ID's to the model's predicted probability -that a question is unanswerable. 
-""" - -import collections -import re -import string - -from absl import logging - - -def _make_qid_to_has_ans(dataset): - qid_to_has_ans = {} - for article in dataset: - for p in article['paragraphs']: - for qa in p['qas']: - qid_to_has_ans[qa['id']] = bool(qa['answers']) - return qid_to_has_ans - - -def _normalize_answer(s): - """Lower text and remove punctuation, articles and extra whitespace.""" - def remove_articles(text): - regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) - return re.sub(regex, ' ', text) - def white_space_fix(text): - return ' '.join(text.split()) - def remove_punc(text): - exclude = set(string.punctuation) - return ''.join(ch for ch in text if ch not in exclude) - def lower(text): - return text.lower() - return white_space_fix(remove_articles(remove_punc(lower(s)))) - - -def _get_tokens(s): - if not s: return [] - return _normalize_answer(s).split() - - -def _compute_exact(a_gold, a_pred): - return int(_normalize_answer(a_gold) == _normalize_answer(a_pred)) - - -def _compute_f1(a_gold, a_pred): - """Compute F1-score.""" - gold_toks = _get_tokens(a_gold) - pred_toks = _get_tokens(a_pred) - common = collections.Counter(gold_toks) & collections.Counter(pred_toks) - num_same = sum(common.values()) - if not gold_toks or not pred_toks: - # If either is no-answer, then F1 is 1 if they agree, 0 otherwise - return int(gold_toks == pred_toks) - if num_same == 0: - return 0 - precision = 1.0 * num_same / len(pred_toks) - recall = 1.0 * num_same / len(gold_toks) - f1 = (2 * precision * recall) / (precision + recall) - return f1 - - -def _get_raw_scores(dataset, predictions): - """Compute raw scores.""" - exact_scores = {} - f1_scores = {} - for article in dataset: - for p in article['paragraphs']: - for qa in p['qas']: - qid = qa['id'] - gold_answers = [a['text'] for a in qa['answers'] - if _normalize_answer(a['text'])] - if not gold_answers: - # For unanswerable questions, only correct answer is empty string - gold_answers = [''] - if qid not in predictions: - logging.error('Missing prediction for %s', qid) - continue - a_pred = predictions[qid] - # Take max over all gold answers - exact_scores[qid] = max(_compute_exact(a, a_pred) for a in gold_answers) - f1_scores[qid] = max(_compute_f1(a, a_pred) for a in gold_answers) - return exact_scores, f1_scores - - -def _apply_no_ans_threshold( - scores, na_probs, qid_to_has_ans, na_prob_thresh=1.0): - new_scores = {} - for qid, s in scores.items(): - pred_na = na_probs[qid] > na_prob_thresh - if pred_na: - new_scores[qid] = float(not qid_to_has_ans[qid]) - else: - new_scores[qid] = s - return new_scores - - -def _make_eval_dict(exact_scores, f1_scores, qid_list=None): - """Make evaluation result dictionary.""" - if not qid_list: - total = len(exact_scores) - return collections.OrderedDict([ - ('exact', 100.0 * sum(exact_scores.values()) / total), - ('f1', 100.0 * sum(f1_scores.values()) / total), - ('total', total), - ]) - else: - total = len(qid_list) - return collections.OrderedDict([ - ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total), - ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total), - ('total', total), - ]) - - -def _merge_eval(main_eval, new_eval, prefix): - for k in new_eval: - main_eval['%s_%s' % (prefix, k)] = new_eval[k] - - -def _make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans): - """Make evaluation dictionary containing average recision recall.""" - qid_list = sorted(na_probs, key=lambda k: na_probs[k]) - true_pos = 0.0 - cur_p = 1.0 - cur_r = 0.0 - precisions = 
[1.0] - recalls = [0.0] - avg_prec = 0.0 - for i, qid in enumerate(qid_list): - if qid_to_has_ans[qid]: - true_pos += scores[qid] - cur_p = true_pos / float(i+1) - cur_r = true_pos / float(num_true_pos) - if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]: - # i.e., if we can put a threshold after this point - avg_prec += cur_p * (cur_r - recalls[-1]) - precisions.append(cur_p) - recalls.append(cur_r) - return {'ap': 100.0 * avg_prec} - - -def _run_precision_recall_analysis( - main_eval, exact_raw, f1_raw, na_probs, qid_to_has_ans): - """Run precision recall analysis and return result dictionary.""" - num_true_pos = sum(1 for v in qid_to_has_ans.values() if v) - if num_true_pos == 0: - return - pr_exact = _make_precision_recall_eval( - exact_raw, na_probs, num_true_pos, qid_to_has_ans) - pr_f1 = _make_precision_recall_eval( - f1_raw, na_probs, num_true_pos, qid_to_has_ans) - oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()} - pr_oracle = _make_precision_recall_eval( - oracle_scores, na_probs, num_true_pos, qid_to_has_ans) - _merge_eval(main_eval, pr_exact, 'pr_exact') - _merge_eval(main_eval, pr_f1, 'pr_f1') - _merge_eval(main_eval, pr_oracle, 'pr_oracle') - - -def _find_best_thresh(predictions, scores, na_probs, qid_to_has_ans): - """Find the best threshold for no answer probability.""" - num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) - cur_score = num_no_ans - best_score = cur_score - best_thresh = 0.0 - qid_list = sorted(na_probs, key=lambda k: na_probs[k]) - for qid in qid_list: - if qid not in scores: continue - if qid_to_has_ans[qid]: - diff = scores[qid] - else: - if predictions[qid]: - diff = -1 - else: - diff = 0 - cur_score += diff - if cur_score > best_score: - best_score = cur_score - best_thresh = na_probs[qid] - return 100.0 * best_score / len(scores), best_thresh - - -def _find_all_best_thresh( - main_eval, predictions, exact_raw, f1_raw, na_probs, qid_to_has_ans): - best_exact, exact_thresh = _find_best_thresh( - predictions, exact_raw, na_probs, qid_to_has_ans) - best_f1, f1_thresh = _find_best_thresh( - predictions, f1_raw, na_probs, qid_to_has_ans) - main_eval['final_exact'] = best_exact - main_eval['final_exact_thresh'] = exact_thresh - main_eval['final_f1'] = best_f1 - main_eval['final_f1_thresh'] = f1_thresh - - -def evaluate(dataset, predictions, na_probs=None): - """Evaluate prediction results.""" - new_orig_data = [] - for article in dataset: - for p in article['paragraphs']: - for qa in p['qas']: - if qa['id'] in predictions: - new_para = {'qas': [qa]} - new_article = {'paragraphs': [new_para]} - new_orig_data.append(new_article) - dataset = new_orig_data - - if na_probs is None: - na_probs = {k: 0.0 for k in predictions} - qid_to_has_ans = _make_qid_to_has_ans(dataset) # maps qid to True/False - has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] - no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] - exact_raw, f1_raw = _get_raw_scores(dataset, predictions) - exact_thresh = _apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans) - f1_thresh = _apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans) - out_eval = _make_eval_dict(exact_thresh, f1_thresh) - if has_ans_qids: - has_ans_eval = _make_eval_dict( - exact_thresh, f1_thresh, qid_list=has_ans_qids) - _merge_eval(out_eval, has_ans_eval, 'HasAns') - if no_ans_qids: - no_ans_eval = _make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids) - _merge_eval(out_eval, no_ans_eval, 'NoAns') - - _find_all_best_thresh( - out_eval, 
predictions, exact_raw, f1_raw, na_probs, qid_to_has_ans) - _run_precision_recall_analysis( - out_eval, exact_raw, f1_raw, na_probs, qid_to_has_ans) - return out_eval diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/tf1_checkpoint_converter_lib.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/tf1_checkpoint_converter_lib.py deleted file mode 100644 index ba6e593be16db3e6396a62f226171bbd1be0db97..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/tf1_checkpoint_converter_lib.py +++ /dev/null @@ -1,217 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -r"""Convert checkpoints created by Estimator (tf1) to be Keras compatible.""" - -import numpy as np -import tensorflow.compat.v1 as tf # TF 1.x - -# Mapping between old <=> new names. The source pattern in original variable -# name will be replaced by destination pattern. 
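-# For example, applying the pairs below in order rewrites the TF1 variable
-# "bert/embeddings/word_embeddings" to "bert_model/word_embeddings/embeddings".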
-BERT_NAME_REPLACEMENTS = ( - ("bert", "bert_model"), - ("embeddings/word_embeddings", "word_embeddings/embeddings"), - ("embeddings/token_type_embeddings", - "embedding_postprocessor/type_embeddings"), - ("embeddings/position_embeddings", - "embedding_postprocessor/position_embeddings"), - ("embeddings/LayerNorm", "embedding_postprocessor/layer_norm"), - ("attention/self", "self_attention"), - ("attention/output/dense", "self_attention_output"), - ("attention/output/LayerNorm", "self_attention_layer_norm"), - ("intermediate/dense", "intermediate"), - ("output/dense", "output"), - ("output/LayerNorm", "output_layer_norm"), - ("pooler/dense", "pooler_transform"), -) - -BERT_V2_NAME_REPLACEMENTS = ( - ("bert/", ""), - ("encoder", "transformer"), - ("embeddings/word_embeddings", "word_embeddings/embeddings"), - ("embeddings/token_type_embeddings", "type_embeddings/embeddings"), - ("embeddings/position_embeddings", "position_embedding/embeddings"), - ("embeddings/LayerNorm", "embeddings/layer_norm"), - ("attention/self", "self_attention"), - ("attention/output/dense", "self_attention/attention_output"), - ("attention/output/LayerNorm", "self_attention_layer_norm"), - ("intermediate/dense", "intermediate"), - ("output/dense", "output"), - ("output/LayerNorm", "output_layer_norm"), - ("pooler/dense", "pooler_transform"), - ("cls/predictions", "bert/cls/predictions"), - ("cls/predictions/output_bias", "cls/predictions/output_bias/bias"), - ("cls/seq_relationship/output_bias", "predictions/transform/logits/bias"), - ("cls/seq_relationship/output_weights", - "predictions/transform/logits/kernel"), -) - -BERT_PERMUTATIONS = () - -BERT_V2_PERMUTATIONS = (("cls/seq_relationship/output_weights", (1, 0)),) - - -def _bert_name_replacement(var_name, name_replacements): - """Gets the variable name replacement.""" - for src_pattern, tgt_pattern in name_replacements: - if src_pattern in var_name: - old_var_name = var_name - var_name = var_name.replace(src_pattern, tgt_pattern) - tf.logging.info("Converted: %s --> %s", old_var_name, var_name) - return var_name - - -def _has_exclude_patterns(name, exclude_patterns): - """Checks if a string contains substrings that match patterns to exclude.""" - for p in exclude_patterns: - if p in name: - return True - return False - - -def _get_permutation(name, permutations): - """Checks whether a variable requires transposition by pattern matching.""" - for src_pattern, permutation in permutations: - if src_pattern in name: - tf.logging.info("Permuted: %s --> %s", name, permutation) - return permutation - - return None - - -def _get_new_shape(name, shape, num_heads): - """Checks whether a variable requires reshape by pattern matching.""" - if "self_attention/attention_output/kernel" in name: - return tuple([num_heads, shape[0] // num_heads, shape[1]]) - if "self_attention/attention_output/bias" in name: - return shape - - patterns = [ - "self_attention/query", "self_attention/value", "self_attention/key" - ] - for pattern in patterns: - if pattern in name: - if "kernel" in name: - return tuple([shape[0], num_heads, shape[1] // num_heads]) - if "bias" in name: - return tuple([num_heads, shape[0] // num_heads]) - return None - - -def create_v2_checkpoint(model, - src_checkpoint, - output_path, - checkpoint_model_name="model"): - """Converts a name-based matched TF V1 checkpoint to TF V2 checkpoint.""" - # Uses streaming-restore in eager model to read V1 name-based checkpoints. 
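-  # assert_existing_objects_matched() raises if any variable that the model
-  # has already created is missing from the source checkpoint.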
-  model.load_weights(src_checkpoint).assert_existing_objects_matched()
-  if hasattr(model, "checkpoint_items"):
-    checkpoint_items = model.checkpoint_items
-  else:
-    checkpoint_items = {}
-
-  checkpoint_items[checkpoint_model_name] = model
-  checkpoint = tf.train.Checkpoint(**checkpoint_items)
-  checkpoint.save(output_path)
-
-
-def convert(checkpoint_from_path,
-            checkpoint_to_path,
-            num_heads,
-            name_replacements,
-            permutations,
-            exclude_patterns=None):
-  """Migrates the names of variables within a checkpoint.
-
-  Args:
-    checkpoint_from_path: Path to source checkpoint to be read in.
-    checkpoint_to_path: Path to checkpoint to be written out.
-    num_heads: The number of heads of the model.
-    name_replacements: A list of tuples of the form (match_str, replace_str)
-      describing variable names to adjust.
-    permutations: A list of tuples of the form (match_str, permutation)
-      describing permutations to apply to given variables. Note that match_str
-      should match the original variable name, not the replaced one.
-    exclude_patterns: A list of string patterns to exclude variables from
-      checkpoint conversion.
-
-  Returns:
-    A dictionary that maps the new variable names to the Variable objects.
-    A dictionary that maps the old variable names to the new variable names.
-  """
-  with tf.Graph().as_default():
-    tf.logging.info("Reading checkpoint_from_path %s", checkpoint_from_path)
-    reader = tf.train.NewCheckpointReader(checkpoint_from_path)
-    name_shape_map = reader.get_variable_to_shape_map()
-    new_variable_map = {}
-    conversion_map = {}
-    for var_name in name_shape_map:
-      if exclude_patterns and _has_exclude_patterns(var_name, exclude_patterns):
-        continue
-      # Get the original tensor data.
-      tensor = reader.get_tensor(var_name)
-
-      # Look up the new variable name, if any.
-      new_var_name = _bert_name_replacement(var_name, name_replacements)
-
-      # See if we need to reshape the underlying tensor.
-      new_shape = None
-      if num_heads > 0:
-        new_shape = _get_new_shape(new_var_name, tensor.shape, num_heads)
-      if new_shape:
-        tf.logging.info("Variable %s has a shape change from %s to %s",
-                        var_name, tensor.shape, new_shape)
-        tensor = np.reshape(tensor, new_shape)
-
-      # See if we need to permute the underlying tensor.
-      permutation = _get_permutation(var_name, permutations)
-      if permutation:
-        tensor = np.transpose(tensor, permutation)
-
-      # Create a new variable with the possibly-reshaped or transposed tensor.
-      var = tf.Variable(tensor, name=var_name)
-
-      # Save the variable into the new variable map.
-      new_variable_map[new_var_name] = var
-
-      # Keep a list of converter variables for sanity checking.
- if new_var_name != var_name: - conversion_map[var_name] = new_var_name - - saver = tf.train.Saver(new_variable_map) - - with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - tf.logging.info("Writing checkpoint_to_path %s", checkpoint_to_path) - saver.save(sess, checkpoint_to_path, write_meta_graph=False) - - tf.logging.info("Summary:") - tf.logging.info(" Converted %d variable name(s).", len(new_variable_map)) - tf.logging.info(" Converted: %s", str(conversion_map)) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/tf2_encoder_checkpoint_converter.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/tf2_encoder_checkpoint_converter.py deleted file mode 100644 index caec572d8a66888046596f27f02b5bd1d276d699..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/tf2_encoder_checkpoint_converter.py +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""A converter from a V1 BERT encoder checkpoint to a V2 encoder checkpoint. - -The conversion will yield an object-oriented checkpoint that can be used -to restore a BertEncoder or BertPretrainerV2 object (see the `converted_model` -FLAG below). 
-""" - -import os - -from absl import app -from absl import flags - -import tensorflow as tf -from official.modeling import tf_utils -from official.nlp.bert import configs -from official.nlp.bert import tf1_checkpoint_converter_lib -from official.nlp.modeling import models -from official.nlp.modeling import networks - -FLAGS = flags.FLAGS - -flags.DEFINE_string("bert_config_file", None, - "Bert configuration file to define core bert layers.") -flags.DEFINE_string( - "checkpoint_to_convert", None, - "Initial checkpoint from a pretrained BERT model core (that is, only the " - "BertModel, with no task heads.)") -flags.DEFINE_string("converted_checkpoint_path", None, - "Name for the created object-based V2 checkpoint.") -flags.DEFINE_string("checkpoint_model_name", "encoder", - "The name of the model when saving the checkpoint, i.e., " - "the checkpoint will be saved using: " - "tf.train.Checkpoint(FLAGS.checkpoint_model_name=model).") -flags.DEFINE_enum( - "converted_model", "encoder", ["encoder", "pretrainer"], - "Whether to convert the checkpoint to a `BertEncoder` model or a " - "`BertPretrainerV2` model (with mlm but without classification heads).") - - -def _create_bert_model(cfg): - """Creates a BERT keras core model from BERT configuration. - - Args: - cfg: A `BertConfig` to create the core model. - - Returns: - A BertEncoder network. - """ - bert_encoder = networks.BertEncoder( - vocab_size=cfg.vocab_size, - hidden_size=cfg.hidden_size, - num_layers=cfg.num_hidden_layers, - num_attention_heads=cfg.num_attention_heads, - intermediate_size=cfg.intermediate_size, - activation=tf_utils.get_activation(cfg.hidden_act), - dropout_rate=cfg.hidden_dropout_prob, - attention_dropout_rate=cfg.attention_probs_dropout_prob, - max_sequence_length=cfg.max_position_embeddings, - type_vocab_size=cfg.type_vocab_size, - initializer=tf.keras.initializers.TruncatedNormal( - stddev=cfg.initializer_range), - embedding_width=cfg.embedding_size) - - return bert_encoder - - -def _create_bert_pretrainer_model(cfg): - """Creates a BERT keras core model from BERT configuration. - - Args: - cfg: A `BertConfig` to create the core model. - - Returns: - A BertPretrainerV2 model. - """ - bert_encoder = _create_bert_model(cfg) - pretrainer = models.BertPretrainerV2( - encoder_network=bert_encoder, - mlm_activation=tf_utils.get_activation(cfg.hidden_act), - mlm_initializer=tf.keras.initializers.TruncatedNormal( - stddev=cfg.initializer_range)) - # Makes sure the pretrainer variables are created. - _ = pretrainer(pretrainer.inputs) - return pretrainer - - -def convert_checkpoint(bert_config, - output_path, - v1_checkpoint, - checkpoint_model_name="model", - converted_model="encoder"): - """Converts a V1 checkpoint into an OO V2 checkpoint.""" - output_dir, _ = os.path.split(output_path) - tf.io.gfile.makedirs(output_dir) - - # Create a temporary V1 name-converted checkpoint in the output directory. 
- temporary_checkpoint_dir = os.path.join(output_dir, "temp_v1") - temporary_checkpoint = os.path.join(temporary_checkpoint_dir, "ckpt") - - tf1_checkpoint_converter_lib.convert( - checkpoint_from_path=v1_checkpoint, - checkpoint_to_path=temporary_checkpoint, - num_heads=bert_config.num_attention_heads, - name_replacements=tf1_checkpoint_converter_lib.BERT_V2_NAME_REPLACEMENTS, - permutations=tf1_checkpoint_converter_lib.BERT_V2_PERMUTATIONS, - exclude_patterns=["adam", "Adam"]) - - if converted_model == "encoder": - model = _create_bert_model(bert_config) - elif converted_model == "pretrainer": - model = _create_bert_pretrainer_model(bert_config) - else: - raise ValueError("Unsupported converted_model: %s" % converted_model) - - # Create a V2 checkpoint from the temporary checkpoint. - tf1_checkpoint_converter_lib.create_v2_checkpoint(model, temporary_checkpoint, - output_path, - checkpoint_model_name) - - # Clean up the temporary checkpoint, if it exists. - try: - tf.io.gfile.rmtree(temporary_checkpoint_dir) - except tf.errors.OpError: - # If it doesn't exist, we don't need to clean it up; continue. - pass - - -def main(argv): - if len(argv) > 1: - raise app.UsageError("Too many command-line arguments.") - - output_path = FLAGS.converted_checkpoint_path - v1_checkpoint = FLAGS.checkpoint_to_convert - checkpoint_model_name = FLAGS.checkpoint_model_name - converted_model = FLAGS.converted_model - bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file) - convert_checkpoint( - bert_config=bert_config, - output_path=output_path, - v1_checkpoint=v1_checkpoint, - checkpoint_model_name=checkpoint_model_name, - converted_model=converted_model) - - -if __name__ == "__main__": - app.run(main) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/tokenization.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/tokenization.py deleted file mode 100644 index 6cc777ce8240a0f7ea7364cd72db7af42ebe2dc7..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/tokenization.py +++ /dev/null @@ -1,557 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# coding=utf-8 -"""Tokenization classes implementation. 
-
-The file is forked from:
-https://github.com/google-research/bert/blob/master/tokenization.py.
-"""
-
-import collections
-import re
-import unicodedata
-
-import six
-import tensorflow as tf
-
-import sentencepiece as spm
-
-SPIECE_UNDERLINE = "▁"
-
-
-def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
-  """Checks whether the casing config is consistent with the checkpoint name."""
-
-  # The casing has to be passed in by the user and there is no explicit check
-  # as to whether it matches the checkpoint. The casing information probably
-  # should have been stored in the bert_config.json file, but it's not, so
-  # we have to heuristically detect it to validate.
-
-  if not init_checkpoint:
-    return
-
-  m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
-  if m is None:
-    return
-
-  model_name = m.group(1)
-
-  lower_models = [
-      "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
-      "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
-  ]
-
-  cased_models = [
-      "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
-      "multi_cased_L-12_H-768_A-12"
-  ]
-
-  is_bad_config = False
-  if model_name in lower_models and not do_lower_case:
-    is_bad_config = True
-    actual_flag = "False"
-    case_name = "lowercased"
-    opposite_flag = "True"
-
-  if model_name in cased_models and do_lower_case:
-    is_bad_config = True
-    actual_flag = "True"
-    case_name = "cased"
-    opposite_flag = "False"
-
-  if is_bad_config:
-    raise ValueError(
-        "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
-        "However, `%s` seems to be a %s model, so you "
-        "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
-        "how the model was pre-trained. If this error is wrong, please "
-        "just comment out this check." %
-        (actual_flag, init_checkpoint, model_name, case_name, opposite_flag))
-
-
-def convert_to_unicode(text):
-  """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
-  if six.PY3:
-    if isinstance(text, str):
-      return text
-    elif isinstance(text, bytes):
-      return text.decode("utf-8", "ignore")
-    else:
-      raise ValueError("Unsupported string type: %s" % (type(text)))
-  elif six.PY2:
-    if isinstance(text, str):
-      return text.decode("utf-8", "ignore")
-    elif isinstance(text, unicode):
-      return text
-    else:
-      raise ValueError("Unsupported string type: %s" % (type(text)))
-  else:
-    raise ValueError("Not running on Python2 or Python 3?")
-
-
-def printable_text(text):
-  """Returns text encoded in a way suitable for print or `tf.logging`."""
-
-  # These functions want `str` for both Python2 and Python3, but in one case
-  # it's a Unicode string and in the other it's a byte string.
-  if six.PY3:
-    if isinstance(text, str):
-      return text
-    elif isinstance(text, bytes):
-      return text.decode("utf-8", "ignore")
-    else:
-      raise ValueError("Unsupported string type: %s" % (type(text)))
-  elif six.PY2:
-    if isinstance(text, str):
-      return text
-    elif isinstance(text, unicode):
-      return text.encode("utf-8")
-    else:
-      raise ValueError("Unsupported string type: %s" % (type(text)))
-  else:
-    raise ValueError("Not running on Python2 or Python 3?")
-
-
-def load_vocab(vocab_file):
-  """Loads a vocabulary file into a dictionary."""
-  vocab = collections.OrderedDict()
-  index = 0
-  with tf.io.gfile.GFile(vocab_file, "r") as reader:
-    while True:
-      token = convert_to_unicode(reader.readline())
-      if not token:
-        break
-      token = token.strip()
-      vocab[token] = index
-      index += 1
-  return vocab
-
-
-def convert_by_vocab(vocab, items):
-  """Converts a sequence of [tokens|ids] using the vocab."""
-  output = []
-  for item in items:
-    output.append(vocab[item])
-  return output
-
-
-def convert_tokens_to_ids(vocab, tokens):
-  return convert_by_vocab(vocab, tokens)
-
-
-def convert_ids_to_tokens(inv_vocab, ids):
-  return convert_by_vocab(inv_vocab, ids)
-
-
-def whitespace_tokenize(text):
-  """Runs basic whitespace cleaning and splitting on a piece of text."""
-  text = text.strip()
-  if not text:
-    return []
-  tokens = text.split()
-  return tokens
-
-
-class FullTokenizer(object):
-  """Runs end-to-end tokenization."""
-
-  def __init__(self, vocab_file, do_lower_case=True, split_on_punc=True):
-    self.vocab = load_vocab(vocab_file)
-    self.inv_vocab = {v: k for k, v in self.vocab.items()}
-    self.basic_tokenizer = BasicTokenizer(
-        do_lower_case=do_lower_case, split_on_punc=split_on_punc)
-    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
-
-  def tokenize(self, text):
-    split_tokens = []
-    for token in self.basic_tokenizer.tokenize(text):
-      for sub_token in self.wordpiece_tokenizer.tokenize(token):
-        split_tokens.append(sub_token)
-
-    return split_tokens
-
-  def convert_tokens_to_ids(self, tokens):
-    return convert_by_vocab(self.vocab, tokens)
-
-  def convert_ids_to_tokens(self, ids):
-    return convert_by_vocab(self.inv_vocab, ids)
-
-
-class BasicTokenizer(object):
-  """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
-
-  def __init__(self, do_lower_case=True, split_on_punc=True):
-    """Constructs a BasicTokenizer.
-
-    Args:
-      do_lower_case: Whether to lower case the input.
-      split_on_punc: Whether to apply split on punctuations. By default BERT
-        starts a new token for punctuations. This makes detokenization difficult
-        for tasks like seq2seq decoding.
-    """
-    self.do_lower_case = do_lower_case
-    self.split_on_punc = split_on_punc
-
-  def tokenize(self, text):
-    """Tokenizes a piece of text."""
-    text = convert_to_unicode(text)
-    text = self._clean_text(text)
-
-    # This was added on November 1st, 2018 for the multilingual and Chinese
-    # models. This is also applied to the English models now, but it doesn't
-    # matter since the English models were not trained on any Chinese data
-    # and generally don't have any Chinese data in them (there are Chinese
-    # characters in the vocabulary because Wikipedia does have some Chinese
-    # words in the English Wikipedia.).
-    text = self._tokenize_chinese_chars(text)
-
-    orig_tokens = whitespace_tokenize(text)
-    split_tokens = []
-    for token in orig_tokens:
-      if self.do_lower_case:
-        token = token.lower()
-        token = self._run_strip_accents(token)
-      if self.split_on_punc:
-        split_tokens.extend(self._run_split_on_punc(token))
-      else:
-        split_tokens.append(token)
-
-    output_tokens = whitespace_tokenize(" ".join(split_tokens))
-    return output_tokens
-
-  def _run_strip_accents(self, text):
-    """Strips accents from a piece of text."""
-    text = unicodedata.normalize("NFD", text)
-    output = []
-    for char in text:
-      cat = unicodedata.category(char)
-      if cat == "Mn":
-        continue
-      output.append(char)
-    return "".join(output)
-
-  def _run_split_on_punc(self, text):
-    """Splits punctuation on a piece of text."""
-    chars = list(text)
-    i = 0
-    start_new_word = True
-    output = []
-    while i < len(chars):
-      char = chars[i]
-      if _is_punctuation(char):
-        output.append([char])
-        start_new_word = True
-      else:
-        if start_new_word:
-          output.append([])
-        start_new_word = False
-        output[-1].append(char)
-      i += 1
-
-    return ["".join(x) for x in output]
-
-  def _tokenize_chinese_chars(self, text):
-    """Adds whitespace around any CJK character."""
-    output = []
-    for char in text:
-      cp = ord(char)
-      if self._is_chinese_char(cp):
-        output.append(" ")
-        output.append(char)
-        output.append(" ")
-      else:
-        output.append(char)
-    return "".join(output)
-
-  def _is_chinese_char(self, cp):
-    """Checks whether CP is the codepoint of a CJK character."""
-    # This defines a "chinese character" as anything in the CJK Unicode block:
-    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-    #
-    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-    # despite its name. The modern Korean Hangul alphabet is a different block,
-    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
-    # space-separated words, so they are not treated specially and handled
-    # like all of the other languages.
-    if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
-        (cp >= 0x3400 and cp <= 0x4DBF) or  #
-        (cp >= 0x20000 and cp <= 0x2A6DF) or  #
-        (cp >= 0x2A700 and cp <= 0x2B73F) or  #
-        (cp >= 0x2B740 and cp <= 0x2B81F) or  #
-        (cp >= 0x2B820 and cp <= 0x2CEAF) or
-        (cp >= 0xF900 and cp <= 0xFAFF) or  #
-        (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
-      return True
-
-    return False
-
-  def _clean_text(self, text):
-    """Performs invalid character removal and whitespace cleanup on text."""
-    output = []
-    for char in text:
-      cp = ord(char)
-      if cp == 0 or cp == 0xfffd or _is_control(char):
-        continue
-      if _is_whitespace(char):
-        output.append(" ")
-      else:
-        output.append(char)
-    return "".join(output)
-
-
-class WordpieceTokenizer(object):
-  """Runs WordPiece tokenization."""
-
-  def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=400):
-    self.vocab = vocab
-    self.unk_token = unk_token
-    self.max_input_chars_per_word = max_input_chars_per_word
-
-  def tokenize(self, text):
-    """Tokenizes a piece of text into its word pieces.
-
-    This uses a greedy longest-match-first algorithm to perform tokenization
-    using the given vocabulary.
-
-    For example:
-      input = "unaffable"
-      output = ["un", "##aff", "##able"]
-
-    Args:
-      text: A single token or whitespace separated tokens. This should have
-        already been passed through `BasicTokenizer`.
-
-    Returns:
-      A list of wordpiece tokens.
- """ - - text = convert_to_unicode(text) - - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - output_tokens.append(self.unk_token) - continue - - is_bad = False - start = 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - while start < end: - substr = "".join(chars[start:end]) - if start > 0: - substr = "##" + substr - if substr in self.vocab: - cur_substr = substr - break - end -= 1 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - start = end - - if is_bad: - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens - - -def _is_whitespace(char): - """Checks whether `chars` is a whitespace character.""" - # \t, \n, and \r are technically control characters but we treat them - # as whitespace since they are generally considered as such. - if char == " " or char == "\t" or char == "\n" or char == "\r": - return True - cat = unicodedata.category(char) - if cat == "Zs": - return True - return False - - -def _is_control(char): - """Checks whether `chars` is a control character.""" - # These are technically control characters but we count them as whitespace - # characters. - if char == "\t" or char == "\n" or char == "\r": - return False - cat = unicodedata.category(char) - if cat in ("Cc", "Cf"): - return True - return False - - -def _is_punctuation(char): - """Checks whether `chars` is a punctuation character.""" - cp = ord(char) - # We treat all non-letter/number ASCII as punctuation. - # Characters such as "^", "$", and "`" are not in the Unicode - # Punctuation class but we treat them as punctuation anyways, for - # consistency. - if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or - (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): - return True - cat = unicodedata.category(char) - if cat.startswith("P"): - return True - return False - - -def preprocess_text(inputs, remove_space=True, lower=False): - """Preprocesses data by removing extra space and normalize data. - - This method is used together with sentence piece tokenizer and is forked from: - https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py - - Args: - inputs: The input text. - remove_space: Whether to remove the extra space. - lower: Whether to lowercase the text. - - Returns: - The preprocessed text. - - """ - outputs = inputs - if remove_space: - outputs = " ".join(inputs.strip().split()) - - if six.PY2 and isinstance(outputs, str): - try: - outputs = six.ensure_text(outputs, "utf-8") - except UnicodeDecodeError: - outputs = six.ensure_text(outputs, "latin-1") - - outputs = unicodedata.normalize("NFKD", outputs) - outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) - if lower: - outputs = outputs.lower() - - return outputs - - -def encode_pieces(sp_model, text, sample=False): - """Segements text into pieces. - - This method is used together with sentence piece tokenizer and is forked from: - https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py - - - Args: - sp_model: A spm.SentencePieceProcessor object. - text: The input text to be segemented. - sample: Whether to randomly sample a segmentation output or return a - deterministic one. - - Returns: - A list of token pieces. 
- """ - if six.PY2 and isinstance(text, six.text_type): - text = six.ensure_binary(text, "utf-8") - - if not sample: - pieces = sp_model.EncodeAsPieces(text) - else: - pieces = sp_model.SampleEncodeAsPieces(text, 64, 0.1) - new_pieces = [] - for piece in pieces: - piece = printable_text(piece) - if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit(): - cur_pieces = sp_model.EncodeAsPieces(piece[:-1].replace( - SPIECE_UNDERLINE, "")) - if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: - if len(cur_pieces[0]) == 1: - cur_pieces = cur_pieces[1:] - else: - cur_pieces[0] = cur_pieces[0][1:] - cur_pieces.append(piece[-1]) - new_pieces.extend(cur_pieces) - else: - new_pieces.append(piece) - - return new_pieces - - -def encode_ids(sp_model, text, sample=False): - """Segments text and return token ids. - - This method is used together with sentence piece tokenizer and is forked from: - https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py - - Args: - sp_model: A spm.SentencePieceProcessor object. - text: The input text to be segemented. - sample: Whether to randomly sample a segmentation output or return a - deterministic one. - - Returns: - A list of token ids. - """ - pieces = encode_pieces(sp_model, text, sample=sample) - ids = [sp_model.PieceToId(piece) for piece in pieces] - return ids - - -class FullSentencePieceTokenizer(object): - """Runs end-to-end sentence piece tokenization. - - The interface of this class is intended to keep the same as above - `FullTokenizer` class for easier usage. - """ - - def __init__(self, sp_model_file): - """Inits FullSentencePieceTokenizer. - - Args: - sp_model_file: The path to the sentence piece model file. - """ - self.sp_model = spm.SentencePieceProcessor() - self.sp_model.Load(sp_model_file) - self.vocab = { - self.sp_model.IdToPiece(i): i - for i in six.moves.range(self.sp_model.GetPieceSize()) - } - - def tokenize(self, text): - """Tokenizes text into pieces.""" - return encode_pieces(self.sp_model, text) - - def convert_tokens_to_ids(self, tokens): - """Converts a list of tokens to a list of ids.""" - return [self.sp_model.PieceToId(printable_text(token)) for token in tokens] - - def convert_ids_to_tokens(self, ids): - """Converts a list of ids ot a list of tokens.""" - return [self.sp_model.IdToPiece(id_) for id_ in ids] diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/tokenization_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/tokenization_test.py deleted file mode 100644 index 43fdf9854c077d592dd56cbe52ae8b57be4d0add..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/bert/tokenization_test.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import os
-import tempfile
-
-import six
-import tensorflow as tf
-
-from official.nlp.bert import tokenization
-
-
-class TokenizationTest(tf.test.TestCase):
-  """Tokenization test.
-
-  The implementation is forked from
-  https://github.com/google-research/bert/blob/master/tokenization_test.py.
-  """
-
-  def test_full_tokenizer(self):
-    vocab_tokens = [
-        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
-        "##ing", ","
-    ]
-    with tempfile.NamedTemporaryFile(delete=False) as vocab_writer:
-      if six.PY2:
-        vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-      else:
-        vocab_writer.write("".join([x + "\n" for x in vocab_tokens
-                                   ]).encode("utf-8"))
-
-      vocab_file = vocab_writer.name
-
-    tokenizer = tokenization.FullTokenizer(vocab_file)
-    os.unlink(vocab_file)
-
-    tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
-    self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
-
-    self.assertAllEqual(
-        tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
-
-  def test_chinese(self):
-    tokenizer = tokenization.BasicTokenizer()
-
-    self.assertAllEqual(
-        tokenizer.tokenize(u"ah\u535A\u63A8zz"),
-        [u"ah", u"\u535A", u"\u63A8", u"zz"])
-
-  def test_basic_tokenizer_lower(self):
-    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
-
-    self.assertAllEqual(
-        tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
-        ["hello", "!", "how", "are", "you", "?"])
-    self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"])
-
-  def test_basic_tokenizer_no_lower(self):
-    tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
-
-    self.assertAllEqual(
-        tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
-        ["HeLLo", "!", "how", "Are", "yoU", "?"])
-
-  def test_basic_tokenizer_no_split_on_punc(self):
-    tokenizer = tokenization.BasicTokenizer(
-        do_lower_case=True, split_on_punc=False)
-
-    self.assertAllEqual(
-        tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "),
-        ["hello!how", "are", "you?"])
-
-  def test_wordpiece_tokenizer(self):
-    vocab_tokens = [
-        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
-        "##ing", "##!", "!"
- ] - - vocab = {} - for (i, token) in enumerate(vocab_tokens): - vocab[token] = i - tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) - - self.assertAllEqual(tokenizer.tokenize(""), []) - - self.assertAllEqual( - tokenizer.tokenize("unwanted running"), - ["un", "##want", "##ed", "runn", "##ing"]) - - self.assertAllEqual( - tokenizer.tokenize("unwanted running !"), - ["un", "##want", "##ed", "runn", "##ing", "!"]) - - self.assertAllEqual( - tokenizer.tokenize("unwanted running!"), - ["un", "##want", "##ed", "runn", "##ing", "##!"]) - - self.assertAllEqual( - tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) - - def test_convert_tokens_to_ids(self): - vocab_tokens = [ - "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", - "##ing" - ] - - vocab = {} - for (i, token) in enumerate(vocab_tokens): - vocab[token] = i - - self.assertAllEqual( - tokenization.convert_tokens_to_ids( - vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9]) - - def test_is_whitespace(self): - self.assertTrue(tokenization._is_whitespace(u" ")) - self.assertTrue(tokenization._is_whitespace(u"\t")) - self.assertTrue(tokenization._is_whitespace(u"\r")) - self.assertTrue(tokenization._is_whitespace(u"\n")) - self.assertTrue(tokenization._is_whitespace(u"\u00A0")) - - self.assertFalse(tokenization._is_whitespace(u"A")) - self.assertFalse(tokenization._is_whitespace(u"-")) - - def test_is_control(self): - self.assertTrue(tokenization._is_control(u"\u0005")) - - self.assertFalse(tokenization._is_control(u"A")) - self.assertFalse(tokenization._is_control(u" ")) - self.assertFalse(tokenization._is_control(u"\t")) - self.assertFalse(tokenization._is_control(u"\r")) - self.assertFalse(tokenization._is_control(u"\U0001F4A9")) - - def test_is_punctuation(self): - self.assertTrue(tokenization._is_punctuation(u"-")) - self.assertTrue(tokenization._is_punctuation(u"$")) - self.assertTrue(tokenization._is_punctuation(u"`")) - self.assertTrue(tokenization._is_punctuation(u".")) - - self.assertFalse(tokenization._is_punctuation(u"A")) - self.assertFalse(tokenization._is_punctuation(u" ")) - - -if __name__ == "__main__": - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/__init__.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/__init__.py deleted file mode 100644 index a11b1ff79e891e0fcee5bf824718e75d9103e28f..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/bert.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/bert.py deleted file mode 100644 index 0dc63ca03eae0bb74570194a794bbbcf777e855d..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/bert.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Multi-head BERT encoder network with classification heads. - -Includes configurations and instantiation methods. -""" -from typing import List, Optional, Text - -import dataclasses - -from official.modeling.hyperparams import base_config -from official.nlp.configs import encoders - - -@dataclasses.dataclass -class ClsHeadConfig(base_config.Config): - inner_dim: int = 0 - num_classes: int = 2 - activation: Optional[Text] = "tanh" - dropout_rate: float = 0.0 - cls_token_idx: int = 0 - name: Optional[Text] = None - - -@dataclasses.dataclass -class PretrainerConfig(base_config.Config): - """Pretrainer configuration.""" - encoder: encoders.EncoderConfig = encoders.EncoderConfig() - cls_heads: List[ClsHeadConfig] = dataclasses.field(default_factory=list) - mlm_activation: str = "gelu" - mlm_initializer_range: float = 0.02 diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/electra.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/electra.py deleted file mode 100644 index 10dd5fa08780c8fc208a99a0821f1d93429475f6..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/electra.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""ELECTRA model configurations and instantiation methods.""" -from typing import List - -import dataclasses - -from official.modeling.hyperparams import base_config -from official.nlp.configs import bert -from official.nlp.configs import encoders - - -@dataclasses.dataclass -class ElectraPretrainerConfig(base_config.Config): - """ELECTRA pretrainer configuration.""" - num_masked_tokens: int = 76 - sequence_length: int = 512 - num_classes: int = 2 - discriminator_loss_weight: float = 50.0 - tie_embeddings: bool = True - disallow_correct: bool = False - generator_encoder: encoders.EncoderConfig = encoders.EncoderConfig() - discriminator_encoder: encoders.EncoderConfig = encoders.EncoderConfig() - cls_heads: List[bert.ClsHeadConfig] = dataclasses.field(default_factory=list) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/encoders.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/encoders.py deleted file mode 100644 index f4d759c908363e7ad07038e25fe9bbeb86c0d1be..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/encoders.py +++ /dev/null @@ -1,446 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""Transformer Encoders.
-
-Includes configurations and factory methods.
-"""
-from typing import Optional
-
-import dataclasses
-import gin
-import tensorflow as tf
-
-from official.modeling import hyperparams
-from official.modeling import tf_utils
-from official.nlp.modeling import layers
-from official.nlp.modeling import networks
-
-
-@dataclasses.dataclass
-class BertEncoderConfig(hyperparams.Config):
-  """BERT encoder configuration."""
-  vocab_size: int = 30522
-  hidden_size: int = 768
-  num_layers: int = 12
-  num_attention_heads: int = 12
-  hidden_activation: str = "gelu"
-  intermediate_size: int = 3072
-  dropout_rate: float = 0.1
-  attention_dropout_rate: float = 0.1
-  max_position_embeddings: int = 512
-  type_vocab_size: int = 2
-  initializer_range: float = 0.02
-  embedding_size: Optional[int] = None
-  output_range: Optional[int] = None
-  return_all_encoder_outputs: bool = False
-
-
-@dataclasses.dataclass
-class MobileBertEncoderConfig(hyperparams.Config):
-  """MobileBERT encoder configuration.
-
-  Attributes:
-    word_vocab_size: number of words in the vocabulary.
-    word_embed_size: word embedding size.
-    type_vocab_size: number of word types.
-    max_sequence_length: maximum length of the input sequence.
-    num_blocks: number of transformer blocks in the encoder model.
-    hidden_size: the hidden size for the transformer block.
-    num_attention_heads: number of attention heads in the transformer block.
-    intermediate_size: the size of the "intermediate" (a.k.a., feed forward)
-      layer.
-    hidden_activation: the non-linear activation function to apply to the
-      output of the intermediate/feed-forward layer.
-    hidden_dropout_prob: dropout probability for the hidden layers.
-    attention_probs_dropout_prob: dropout probability of the attention
-      probabilities.
-    intra_bottleneck_size: the size of the bottleneck.
-    initializer_range: the stddev of the truncated_normal_initializer for
-      initializing all weight matrices.
-    use_bottleneck_attention: whether to use attention inputs from the
-      bottleneck transformation. If True, the following
-      `key_query_shared_bottleneck` is ignored.
-    key_query_shared_bottleneck: whether to share the linear transformation
-      for keys and queries.
-    num_feedforward_networks: number of stacked feed-forward networks.
-    normalization_type: the type of normalization; only 'no_norm' and
-      'layer_norm' are supported. 'no_norm' represents the element-wise linear
-      transformation for the student model, as suggested by the original
-      MobileBERT paper. 'layer_norm' is used for the teacher model.
-    classifier_activation: whether to use the tanh activation for the final
-      representation of the [CLS] token in fine-tuning.
- """ - word_vocab_size: int = 30522 - word_embed_size: int = 128 - type_vocab_size: int = 2 - max_sequence_length: int = 512 - num_blocks: int = 24 - hidden_size: int = 512 - num_attention_heads: int = 4 - intermediate_size: int = 4096 - hidden_activation: str = "gelu" - hidden_dropout_prob: float = 0.1 - attention_probs_dropout_prob: float = 0.1 - intra_bottleneck_size: int = 1024 - initializer_range: float = 0.02 - use_bottleneck_attention: bool = False - key_query_shared_bottleneck: bool = False - num_feedforward_networks: int = 1 - normalization_type: str = "layer_norm" - classifier_activation: bool = True - input_mask_dtype: str = "int32" - - -@dataclasses.dataclass -class AlbertEncoderConfig(hyperparams.Config): - """ALBERT encoder configuration.""" - vocab_size: int = 30000 - embedding_width: int = 128 - hidden_size: int = 768 - num_layers: int = 12 - num_attention_heads: int = 12 - hidden_activation: str = "gelu" - intermediate_size: int = 3072 - dropout_rate: float = 0.0 - attention_dropout_rate: float = 0.0 - max_position_embeddings: int = 512 - type_vocab_size: int = 2 - initializer_range: float = 0.02 - - -@dataclasses.dataclass -class BigBirdEncoderConfig(hyperparams.Config): - """BigBird encoder configuration.""" - vocab_size: int = 50358 - hidden_size: int = 768 - num_layers: int = 12 - num_attention_heads: int = 12 - hidden_activation: str = "gelu" - intermediate_size: int = 3072 - dropout_rate: float = 0.1 - attention_dropout_rate: float = 0.1 - max_position_embeddings: int = 4096 - num_rand_blocks: int = 3 - block_size: int = 64 - type_vocab_size: int = 16 - initializer_range: float = 0.02 - embedding_width: Optional[int] = None - use_gradient_checkpointing: bool = False - - -@dataclasses.dataclass -class KernelEncoderConfig(hyperparams.Config): - """Linear encoder configuration.""" - vocab_size: int = 30522 - hidden_size: int = 768 - num_layers: int = 12 - num_attention_heads: int = 12 - hidden_activation: str = "gelu" - intermediate_size: int = 3072 - dropout_rate: float = 0.1 - attention_dropout_rate: float = 0.1 - max_position_embeddings: int = 512 - type_vocab_size: int = 2 - initializer_range: float = 0.02 - embedding_size: Optional[int] = None - feature_transform: str = "exp" - num_random_features: int = 256 - redraw: bool = False - is_short_seq: bool = False - begin_kernel: int = 0 - - -@dataclasses.dataclass -class XLNetEncoderConfig(hyperparams.Config): - """XLNet encoder configuration.""" - vocab_size: int = 32000 - num_layers: int = 24 - hidden_size: int = 1024 - num_attention_heads: int = 16 - head_size: int = 64 - inner_size: int = 4096 - inner_activation: str = "gelu" - dropout_rate: float = 0.1 - attention_dropout_rate: float = 0.1 - attention_type: str = "bi" - bi_data: bool = False - tie_attention_biases: bool = False - memory_length: int = 0 - same_length: bool = False - clamp_length: int = -1 - reuse_length: int = 0 - use_cls_mask: bool = False - embedding_width: int = 1024 - initializer_range: float = 0.02 - two_stream: bool = False - - -@dataclasses.dataclass -class EncoderConfig(hyperparams.OneOfConfig): - """Encoder configuration.""" - type: Optional[str] = "bert" - albert: AlbertEncoderConfig = AlbertEncoderConfig() - bert: BertEncoderConfig = BertEncoderConfig() - bigbird: BigBirdEncoderConfig = BigBirdEncoderConfig() - kernel: KernelEncoderConfig = KernelEncoderConfig() - mobilebert: MobileBertEncoderConfig = MobileBertEncoderConfig() - xlnet: XLNetEncoderConfig = XLNetEncoderConfig() - - -@gin.configurable -def build_encoder(config: 
EncoderConfig,
-                  embedding_layer: Optional[tf.keras.layers.Layer] = None,
-                  encoder_cls=None,
-                  bypass_config: bool = False):
-  """Instantiate a Transformer encoder network from EncoderConfig.
-
-  Args:
-    config: the one-of encoder config, which provides encoder parameters for
-      the chosen encoder.
-    embedding_layer: an external embedding layer passed to the encoder.
-    encoder_cls: an external encoder class not included in the supported
-      encoders, usually used by gin.configurable.
-    bypass_config: whether to ignore the config instance and create the object
-      directly with `encoder_cls`.
-
-  Returns:
-    An encoder instance.
-  """
-  if bypass_config:
-    return encoder_cls()
-  encoder_type = config.type
-  encoder_cfg = config.get()
-  if encoder_cls and encoder_cls.__name__ == "EncoderScaffold":
-    embedding_cfg = dict(
-        vocab_size=encoder_cfg.vocab_size,
-        type_vocab_size=encoder_cfg.type_vocab_size,
-        hidden_size=encoder_cfg.hidden_size,
-        max_seq_length=encoder_cfg.max_position_embeddings,
-        initializer=tf.keras.initializers.TruncatedNormal(
-            stddev=encoder_cfg.initializer_range),
-        dropout_rate=encoder_cfg.dropout_rate,
-    )
-    hidden_cfg = dict(
-        num_attention_heads=encoder_cfg.num_attention_heads,
-        intermediate_size=encoder_cfg.intermediate_size,
-        intermediate_activation=tf_utils.get_activation(
-            encoder_cfg.hidden_activation),
-        dropout_rate=encoder_cfg.dropout_rate,
-        attention_dropout_rate=encoder_cfg.attention_dropout_rate,
-        kernel_initializer=tf.keras.initializers.TruncatedNormal(
-            stddev=encoder_cfg.initializer_range),
-    )
-    kwargs = dict(
-        embedding_cfg=embedding_cfg,
-        hidden_cfg=hidden_cfg,
-        num_hidden_instances=encoder_cfg.num_layers,
-        pooled_output_dim=encoder_cfg.hidden_size,
-        pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
-            stddev=encoder_cfg.initializer_range),
-        return_all_layer_outputs=encoder_cfg.return_all_encoder_outputs,
-        dict_outputs=True)
-    return encoder_cls(**kwargs)
-
-  if encoder_type == "mobilebert":
-    return networks.MobileBERTEncoder(
-        word_vocab_size=encoder_cfg.word_vocab_size,
-        word_embed_size=encoder_cfg.word_embed_size,
-        type_vocab_size=encoder_cfg.type_vocab_size,
-        max_sequence_length=encoder_cfg.max_sequence_length,
-        num_blocks=encoder_cfg.num_blocks,
-        hidden_size=encoder_cfg.hidden_size,
-        num_attention_heads=encoder_cfg.num_attention_heads,
-        intermediate_size=encoder_cfg.intermediate_size,
-        intermediate_act_fn=encoder_cfg.hidden_activation,
-        hidden_dropout_prob=encoder_cfg.hidden_dropout_prob,
-        attention_probs_dropout_prob=encoder_cfg.attention_probs_dropout_prob,
-        intra_bottleneck_size=encoder_cfg.intra_bottleneck_size,
-        initializer_range=encoder_cfg.initializer_range,
-        use_bottleneck_attention=encoder_cfg.use_bottleneck_attention,
-        key_query_shared_bottleneck=encoder_cfg.key_query_shared_bottleneck,
-        num_feedforward_networks=encoder_cfg.num_feedforward_networks,
-        normalization_type=encoder_cfg.normalization_type,
-        classifier_activation=encoder_cfg.classifier_activation,
-        input_mask_dtype=encoder_cfg.input_mask_dtype)
-
-  if encoder_type == "albert":
-    return networks.AlbertEncoder(
-        vocab_size=encoder_cfg.vocab_size,
-        embedding_width=encoder_cfg.embedding_width,
-        hidden_size=encoder_cfg.hidden_size,
-        num_layers=encoder_cfg.num_layers,
-        num_attention_heads=encoder_cfg.num_attention_heads,
-        max_sequence_length=encoder_cfg.max_position_embeddings,
-        type_vocab_size=encoder_cfg.type_vocab_size,
-        intermediate_size=encoder_cfg.intermediate_size,
-        activation=tf_utils.get_activation(encoder_cfg.hidden_activation),
-
dropout_rate=encoder_cfg.dropout_rate, - attention_dropout_rate=encoder_cfg.attention_dropout_rate, - initializer=tf.keras.initializers.TruncatedNormal( - stddev=encoder_cfg.initializer_range), - dict_outputs=True) - - if encoder_type == "bigbird": - # TODO(frederickliu): Support use_gradient_checkpointing. - if encoder_cfg.use_gradient_checkpointing: - raise ValueError("Gradient checkpointing unsupported at the moment.") - embedding_cfg = dict( - vocab_size=encoder_cfg.vocab_size, - type_vocab_size=encoder_cfg.type_vocab_size, - hidden_size=encoder_cfg.hidden_size, - max_seq_length=encoder_cfg.max_position_embeddings, - initializer=tf.keras.initializers.TruncatedNormal( - stddev=encoder_cfg.initializer_range), - dropout_rate=encoder_cfg.dropout_rate) - attention_cfg = dict( - num_heads=encoder_cfg.num_attention_heads, - key_dim=int(encoder_cfg.hidden_size // encoder_cfg.num_attention_heads), - kernel_initializer=tf.keras.initializers.TruncatedNormal( - stddev=encoder_cfg.initializer_range), - max_rand_mask_length=encoder_cfg.max_position_embeddings, - num_rand_blocks=encoder_cfg.num_rand_blocks, - from_block_size=encoder_cfg.block_size, - to_block_size=encoder_cfg.block_size, - ) - hidden_cfg = dict( - num_attention_heads=encoder_cfg.num_attention_heads, - intermediate_size=encoder_cfg.intermediate_size, - intermediate_activation=tf_utils.get_activation( - encoder_cfg.hidden_activation), - dropout_rate=encoder_cfg.dropout_rate, - attention_dropout_rate=encoder_cfg.attention_dropout_rate, - kernel_initializer=tf.keras.initializers.TruncatedNormal( - stddev=encoder_cfg.initializer_range), - attention_cls=layers.BigBirdAttention, - attention_cfg=attention_cfg) - kwargs = dict( - embedding_cfg=embedding_cfg, - hidden_cls=layers.TransformerScaffold, - hidden_cfg=hidden_cfg, - num_hidden_instances=encoder_cfg.num_layers, - mask_cls=layers.BigBirdMasks, - mask_cfg=dict(block_size=encoder_cfg.block_size), - pooled_output_dim=encoder_cfg.hidden_size, - pooler_layer_initializer=tf.keras.initializers.TruncatedNormal( - stddev=encoder_cfg.initializer_range), - return_all_layer_outputs=False, - dict_outputs=True, - layer_idx_as_attention_seed=True) - return networks.EncoderScaffold(**kwargs) - - if encoder_type == "kernel": - embedding_cfg = dict( - vocab_size=encoder_cfg.vocab_size, - type_vocab_size=encoder_cfg.type_vocab_size, - hidden_size=encoder_cfg.hidden_size, - max_seq_length=encoder_cfg.max_position_embeddings, - initializer=tf.keras.initializers.TruncatedNormal( - stddev=encoder_cfg.initializer_range), - dropout_rate=encoder_cfg.dropout_rate) - attention_cfg = dict( - num_heads=encoder_cfg.num_attention_heads, - key_dim=int(encoder_cfg.hidden_size // encoder_cfg.num_attention_heads), - kernel_initializer=tf.keras.initializers.TruncatedNormal( - stddev=encoder_cfg.initializer_range), - feature_transform=encoder_cfg.feature_transform, - num_random_features=encoder_cfg.num_random_features, - redraw=encoder_cfg.redraw, - is_short_seq=encoder_cfg.is_short_seq, - begin_kernel=encoder_cfg.begin_kernel, - ) - hidden_cfg = dict( - num_attention_heads=encoder_cfg.num_attention_heads, - intermediate_size=encoder_cfg.intermediate_size, - intermediate_activation=tf_utils.get_activation( - encoder_cfg.hidden_activation), - dropout_rate=encoder_cfg.dropout_rate, - attention_dropout_rate=encoder_cfg.attention_dropout_rate, - kernel_initializer=tf.keras.initializers.TruncatedNormal( - stddev=encoder_cfg.initializer_range), - attention_cls=layers.KernelAttention, - attention_cfg=attention_cfg) - kwargs = 
dict( - embedding_cfg=embedding_cfg, - hidden_cls=layers.TransformerScaffold, - hidden_cfg=hidden_cfg, - num_hidden_instances=encoder_cfg.num_layers, - mask_cls=layers.KernelMask, - pooled_output_dim=encoder_cfg.hidden_size, - pooler_layer_initializer=tf.keras.initializers.TruncatedNormal( - stddev=encoder_cfg.initializer_range), - return_all_layer_outputs=False, - dict_outputs=True, - layer_idx_as_attention_seed=True) - return networks.EncoderScaffold(**kwargs) - - if encoder_type == "xlnet": - return networks.XLNetBase( - vocab_size=encoder_cfg.vocab_size, - num_layers=encoder_cfg.num_layers, - hidden_size=encoder_cfg.hidden_size, - num_attention_heads=encoder_cfg.num_attention_heads, - head_size=encoder_cfg.head_size, - inner_size=encoder_cfg.inner_size, - dropout_rate=encoder_cfg.dropout_rate, - attention_dropout_rate=encoder_cfg.attention_dropout_rate, - attention_type=encoder_cfg.attention_type, - bi_data=encoder_cfg.bi_data, - two_stream=encoder_cfg.two_stream, - tie_attention_biases=encoder_cfg.tie_attention_biases, - memory_length=encoder_cfg.memory_length, - clamp_length=encoder_cfg.clamp_length, - reuse_length=encoder_cfg.reuse_length, - inner_activation=encoder_cfg.inner_activation, - use_cls_mask=encoder_cfg.use_cls_mask, - embedding_width=encoder_cfg.embedding_width, - initializer=tf.keras.initializers.RandomNormal( - stddev=encoder_cfg.initializer_range)) - - # Uses the default BERTEncoder configuration schema to create the encoder. - # If it does not match, please add a switch branch by the encoder type. - return networks.BertEncoder( - vocab_size=encoder_cfg.vocab_size, - hidden_size=encoder_cfg.hidden_size, - num_layers=encoder_cfg.num_layers, - num_attention_heads=encoder_cfg.num_attention_heads, - intermediate_size=encoder_cfg.intermediate_size, - activation=tf_utils.get_activation(encoder_cfg.hidden_activation), - dropout_rate=encoder_cfg.dropout_rate, - attention_dropout_rate=encoder_cfg.attention_dropout_rate, - max_sequence_length=encoder_cfg.max_position_embeddings, - type_vocab_size=encoder_cfg.type_vocab_size, - initializer=tf.keras.initializers.TruncatedNormal( - stddev=encoder_cfg.initializer_range), - output_range=encoder_cfg.output_range, - embedding_width=encoder_cfg.embedding_size, - embedding_layer=embedding_layer, - return_all_encoder_outputs=encoder_cfg.return_all_encoder_outputs, - dict_outputs=True) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/encoders_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/encoders_test.py deleted file mode 100644 index 89ba49c4474ee922a6bdf14e7bf4d0290f06621a..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/encoders_test.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
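A minimal sketch of how the one-of config above drives `build_encoder`: the `type` field selects which sub-config `config.get()` returns, and types without a dedicated branch fall through to the default `BertEncoder` construction at the end. Values here are illustrative:

from official.nlp.configs import encoders

config = encoders.EncoderConfig(
    type="albert",
    albert=encoders.AlbertEncoderConfig(num_layers=6))
# Dispatches on config.type and reads fields from config.get().
encoder = encoders.build_encoder(config)  # a networks.AlbertEncoder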
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""Tests for official.nlp.configs.encoders."""
-import os
-
-import tensorflow as tf
-
-from official.modeling import hyperparams
-from official.nlp.configs import encoders
-
-
-class EncodersTest(tf.test.TestCase):
-
-  def test_encoder_from_yaml(self):
-    config = encoders.EncoderConfig(
-        type="bert", bert=encoders.BertEncoderConfig(num_layers=1))
-    encoder = encoders.build_encoder(config)
-    ckpt = tf.train.Checkpoint(encoder=encoder)
-    ckpt_path = ckpt.save(self.get_temp_dir() + "/ckpt")
-    params_save_path = os.path.join(self.get_temp_dir(), "params.yaml")
-    hyperparams.save_params_dict_to_yaml(config, params_save_path)
-
-    restored_cfg = encoders.EncoderConfig.from_yaml(params_save_path)
-    restored_encoder = encoders.build_encoder(restored_cfg)
-    status = tf.train.Checkpoint(encoder=restored_encoder).restore(ckpt_path)
-    status.assert_consumed()
-
-
-if __name__ == "__main__":
-  tf.test.main()
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/experiment_configs.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/experiment_configs.py
deleted file mode 100644
index 1185d17d329b5011bdf730c4c5d14360cb2e29e0..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/experiment_configs.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# - -"""Experiments definition.""" -# pylint: disable=unused-import -from official.nlp.configs import finetuning_experiments -from official.nlp.configs import pretraining_experiments -from official.nlp.configs import wmt_transformer_experiments diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/experiments/glue_mnli_matched.yaml b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/experiments/glue_mnli_matched.yaml deleted file mode 100644 index 29dfcb68b9c314d309239c321dde4ec4f439da1d..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/experiments/glue_mnli_matched.yaml +++ /dev/null @@ -1,49 +0,0 @@ -task: - hub_module_url: '' - model: - num_classes: 3 - init_checkpoint: '' - metric_type: 'accuracy' - train_data: - drop_remainder: true - global_batch_size: 32 - input_path: '' - is_training: true - seq_length: 128 - label_type: 'int' - validation_data: - drop_remainder: false - global_batch_size: 32 - input_path: '' - is_training: false - seq_length: 128 - label_type: 'int' -trainer: - checkpoint_interval: 3000 - optimizer_config: - learning_rate: - polynomial: - # 100% of train_steps. - decay_steps: 36813 - end_learning_rate: 0.0 - initial_learning_rate: 3.0e-05 - power: 1.0 - type: polynomial - optimizer: - type: adamw - warmup: - polynomial: - power: 1 - # ~10% of train_steps. - warmup_steps: 3681 - type: polynomial - steps_per_loop: 1000 - summary_interval: 1000 - # Training data size 392,702 examples, 3 epochs. - train_steps: 36813 - validation_interval: 6135 - # Eval data size = 9815 examples. - validation_steps: 307 - best_checkpoint_export_subdir: 'best_ckpt' - best_checkpoint_eval_metric: 'cls_accuracy' - best_checkpoint_metric_comp: 'higher' diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/experiments/squad_v1.yaml b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/experiments/squad_v1.yaml deleted file mode 100644 index a69710a58f7dfa4e044bceb73c5870701ca39189..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/experiments/squad_v1.yaml +++ /dev/null @@ -1,50 +0,0 @@ -task: - hub_module_url: '' - max_answer_length: 30 - n_best_size: 20 - null_score_diff_threshold: 0.0 - init_checkpoint: '' - train_data: - drop_remainder: true - global_batch_size: 48 - input_path: '' - is_training: true - seq_length: 384 - validation_data: - do_lower_case: true - doc_stride: 128 - drop_remainder: false - global_batch_size: 48 - input_path: '' - is_training: false - query_length: 64 - seq_length: 384 - tokenization: WordPiece - version_2_with_negative: false - vocab_file: '' -trainer: - checkpoint_interval: 1000 - max_to_keep: 5 - optimizer_config: - learning_rate: - polynomial: - decay_steps: 3699 - end_learning_rate: 0.0 - initial_learning_rate: 8.0e-05 - power: 1.0 - type: polynomial - optimizer: - type: adamw - warmup: - polynomial: - power: 1 - warmup_steps: 370 - type: polynomial - steps_per_loop: 1000 - summary_interval: 1000 - train_steps: 3699 - validation_interval: 1000 - validation_steps: 226 - best_checkpoint_export_subdir: 'best_ckpt' - best_checkpoint_eval_metric: 'final_f1' - best_checkpoint_metric_comp: 'higher' diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/finetuning_experiments.py 
b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/finetuning_experiments.py deleted file mode 100644 index 6aef1fcc5f12d3aaf95ce2d072f969e3c445df4f..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/finetuning_experiments.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Finetuning experiment configurations.""" -# pylint: disable=g-doc-return-or-yield,line-too-long -from official.core import config_definitions as cfg -from official.core import exp_factory -from official.modeling import optimization -from official.nlp.data import question_answering_dataloader -from official.nlp.data import sentence_prediction_dataloader -from official.nlp.data import tagging_dataloader -from official.nlp.tasks import question_answering -from official.nlp.tasks import sentence_prediction -from official.nlp.tasks import tagging - - -@exp_factory.register_config_factory('bert/sentence_prediction') -def bert_sentence_prediction() -> cfg.ExperimentConfig: - r"""BERT GLUE.""" - config = cfg.ExperimentConfig( - task=sentence_prediction.SentencePredictionConfig( - train_data=sentence_prediction_dataloader - .SentencePredictionDataConfig(), - validation_data=sentence_prediction_dataloader - .SentencePredictionDataConfig( - is_training=False, drop_remainder=False)), - trainer=cfg.TrainerConfig( - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'adamw', - 'adamw': { - 'weight_decay_rate': - 0.01, - 'exclude_from_weight_decay': - ['LayerNorm', 'layer_norm', 'bias'], - } - }, - 'learning_rate': { - 'type': 'polynomial', - 'polynomial': { - 'initial_learning_rate': 3e-5, - 'end_learning_rate': 0.0, - } - }, - 'warmup': { - 'type': 'polynomial' - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) - config.task.model.encoder.type = 'bert' - return config - - -@exp_factory.register_config_factory('bert/squad') -def bert_squad() -> cfg.ExperimentConfig: - """BERT Squad V1/V2.""" - config = cfg.ExperimentConfig( - task=question_answering.QuestionAnsweringConfig( - train_data=question_answering_dataloader.QADataConfig(), - 
validation_data=question_answering_dataloader.QADataConfig()), - trainer=cfg.TrainerConfig( - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'adamw', - 'adamw': { - 'weight_decay_rate': - 0.01, - 'exclude_from_weight_decay': - ['LayerNorm', 'layer_norm', 'bias'], - } - }, - 'learning_rate': { - 'type': 'polynomial', - 'polynomial': { - 'initial_learning_rate': 8e-5, - 'end_learning_rate': 0.0, - } - }, - 'warmup': { - 'type': 'polynomial' - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) - config.task.model.encoder.type = 'bert' - return config - - -@exp_factory.register_config_factory('bert/tagging') -def bert_tagging() -> cfg.ExperimentConfig: - """BERT tagging task.""" - config = cfg.ExperimentConfig( - task=tagging.TaggingConfig( - train_data=tagging_dataloader.TaggingDataConfig(), - validation_data=tagging_dataloader.TaggingDataConfig( - is_training=False, drop_remainder=False)), - trainer=cfg.TrainerConfig( - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'adamw', - 'adamw': { - 'weight_decay_rate': - 0.01, - 'exclude_from_weight_decay': - ['LayerNorm', 'layer_norm', 'bias'], - } - }, - 'learning_rate': { - 'type': 'polynomial', - 'polynomial': { - 'initial_learning_rate': 8e-5, - 'end_learning_rate': 0.0, - } - }, - 'warmup': { - 'type': 'polynomial' - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None', - ]) - return config diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/models/bert_en_uncased_base.yaml b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/models/bert_en_uncased_base.yaml deleted file mode 100644 index 1e49bc5430ed0135aa6d981421aad623f4f1fac9..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/models/bert_en_uncased_base.yaml +++ /dev/null @@ -1,16 +0,0 @@ -task: - model: - encoder: - type: bert - bert: - attention_dropout_rate: 0.1 - dropout_rate: 0.1 - hidden_activation: gelu - hidden_size: 768 - initializer_range: 0.02 - intermediate_size: 3072 - max_position_embeddings: 512 - num_attention_heads: 12 - num_layers: 12 - type_vocab_size: 2 - vocab_size: 30522 diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/pretraining_experiments.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/pretraining_experiments.py deleted file mode 100644 index 7ef200989a5e773d6722d13aacd81c439ad9aebf..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/pretraining_experiments.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
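Because the factories above are registered by name with `exp_factory`, a training driver can look an experiment up and override fields before training. A minimal sketch, assuming the registrations in this file have been imported (for example via `official.common.registry_imports`); the data paths are hypothetical:

from official.core import exp_factory

config = exp_factory.get_exp_config('bert/squad')
# Point the task at training/eval data and set the schedule.
config.task.train_data.input_path = '/data/squad_train.tf_record'  # hypothetical path
config.task.validation_data.input_path = '/data/squad_eval.tf_record'  # hypothetical path
config.trainer.train_steps = 3699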
-# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Pretraining experiment configurations.""" -# pylint: disable=g-doc-return-or-yield,line-too-long -from official.core import config_definitions as cfg -from official.core import exp_factory -from official.modeling import optimization -from official.nlp.data import pretrain_dataloader -from official.nlp.data import pretrain_dynamic_dataloader -from official.nlp.tasks import masked_lm - -_TRAINER = cfg.TrainerConfig( - train_steps=1000000, - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'adamw', - 'adamw': { - 'weight_decay_rate': - 0.01, - 'exclude_from_weight_decay': [ - 'LayerNorm', 'layer_norm', 'bias' - ], - } - }, - 'learning_rate': { - 'type': 'polynomial', - 'polynomial': { - 'initial_learning_rate': 1e-4, - 'end_learning_rate': 0.0, - } - }, - 'warmup': { - 'type': 'polynomial' - } - })) - - -@exp_factory.register_config_factory('bert/pretraining') -def bert_pretraining() -> cfg.ExperimentConfig: - """BERT pretraining experiment.""" - config = cfg.ExperimentConfig( - task=masked_lm.MaskedLMConfig( - train_data=pretrain_dataloader.BertPretrainDataConfig(), - validation_data=pretrain_dataloader.BertPretrainDataConfig( - is_training=False)), - trainer=_TRAINER, - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) - return config - - -@exp_factory.register_config_factory('bert/pretraining_dynamic') -def bert_dynamic() -> cfg.ExperimentConfig: - """BERT base with dynamic input sequences. - - TPU needs to run with tf.data service with round-robin behavior. - """ - config = cfg.ExperimentConfig( - task=masked_lm.MaskedLMConfig( - train_data=pretrain_dynamic_dataloader.BertPretrainDataConfig(), - validation_data=pretrain_dataloader.BertPretrainDataConfig( - is_training=False)), - trainer=_TRAINER, - restrictions=[ - 'task.train_data.is_training != None', - 'task.validation_data.is_training != None' - ]) - return config diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/wmt_transformer_experiments.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/wmt_transformer_experiments.py deleted file mode 100644 index bddce16bf2d3a72c039271283ae9debbcbd528b5..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/configs/wmt_transformer_experiments.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Lint as: python3 -# pylint: disable=g-doc-return-or-yield,line-too-long -"""WMT translation configurations.""" - -from official.core import config_definitions as cfg -from official.core import exp_factory -from official.modeling import optimization -from official.nlp.data import wmt_dataloader -from official.nlp.tasks import translation - - -@exp_factory.register_config_factory('wmt_transformer/large') -def wmt_transformer_large() -> cfg.ExperimentConfig: - """WMT Transformer Large. - - Please refer to - tensorflow_models/official/nlp/data/train_sentencepiece.py - to generate sentencepiece_model - and pass - --params_override=task.sentencepiece_model_path='YOUR_PATH' - to the train script. 
- """ - learning_rate = 2.0 - hidden_size = 1024 - learning_rate *= (hidden_size**-0.5) - warmup_steps = 16000 - train_steps = 300000 - token_batch_size = 24576 - encdecoder = translation.EncDecoder( - num_attention_heads=16, intermediate_size=hidden_size * 4) - config = cfg.ExperimentConfig( - task=translation.TranslationConfig( - model=translation.ModelConfig( - encoder=encdecoder, - decoder=encdecoder, - embedding_width=hidden_size, - padded_decode=True, - decode_max_length=100), - train_data=wmt_dataloader.WMTDataConfig( - tfds_name='wmt14_translate/de-en', - tfds_split='train', - src_lang='en', - tgt_lang='de', - is_training=True, - global_batch_size=token_batch_size, - static_batch=True, - max_seq_length=64 - ), - validation_data=wmt_dataloader.WMTDataConfig( - tfds_name='wmt14_translate/de-en', - tfds_split='test', - src_lang='en', - tgt_lang='de', - is_training=False, - global_batch_size=32, - static_batch=True, - max_seq_length=100, - ), - sentencepiece_model_path=None, - ), - trainer=cfg.TrainerConfig( - train_steps=train_steps, - validation_steps=-1, - steps_per_loop=1000, - summary_interval=1000, - checkpoint_interval=5000, - validation_interval=5000, - max_to_keep=1, - optimizer_config=optimization.OptimizationConfig({ - 'optimizer': { - 'type': 'adam', - 'adam': { - 'beta_2': 0.997, - 'epsilon': 1e-9, - }, - }, - 'learning_rate': { - 'type': 'power', - 'power': { - 'initial_learning_rate': learning_rate, - 'power': -0.5, - } - }, - 'warmup': { - 'type': 'linear', - 'linear': { - 'warmup_steps': warmup_steps, - 'warmup_learning_rate': 0.0 - } - } - })), - restrictions=[ - 'task.train_data.is_training != None', - 'task.sentencepiece_model_path != None', - ]) - return config diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/continuous_finetune_lib.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/continuous_finetune_lib.py deleted file mode 100644 index 5274e4b720110cf8116224c564b2d80b3790b2b8..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/continuous_finetune_lib.py +++ /dev/null @@ -1,231 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#
-
-"""TFM continuous finetuning+eval training driver library."""
-import gc
-import os
-import time
-from typing import Any, Mapping, Optional
-
-from absl import logging
-import tensorflow as tf
-
-from official.common import distribute_utils
-from official.core import config_definitions
-from official.core import task_factory
-from official.core import train_lib
-from official.core import train_utils
-from official.modeling import performance
-from official.modeling.multitask import configs
-from official.modeling.multitask import multitask
-from official.modeling.multitask import train_lib as multitask_train_lib
-
-
-def _flatten_dict(xs):
-  """Flatten a nested dictionary.
-
-  The nested keys are flattened to a tuple.
-
-  Example::
-
-    xs = {'foo': 1, 'bar': {'a': 2, 'b': {}}}
-    flat_xs = flatten_dict(xs)
-    print(flat_xs)
-    # {
-    #   ('foo',): 1,
-    #   ('bar', 'a'): 2,
-    # }
-
-  Note that empty dictionaries are ignored and
-  will not be restored by `unflatten_dict`.
-
-  Args:
-    xs: a nested dictionary
-
-  Returns:
-    The flattened dictionary.
-  """
-  assert isinstance(xs, dict), 'input is not a dict'
-
-  def _flatten(xs, prefix):
-    if not isinstance(xs, dict):
-      return {prefix: xs}
-    result = {}
-    for key, value in xs.items():
-      path = prefix + (key,)
-      result.update(_flatten(value, path))
-    return result
-
-  return _flatten(xs, ())
-
-
-def run_continuous_finetune(
-    mode: str,
-    params: config_definitions.ExperimentConfig,
-    model_dir: str,
-    run_post_eval: bool = False,
-    pretrain_steps: Optional[int] = None,
-) -> Mapping[str, Any]:
-  """Run modes with continuous training.
-
-  Currently only supports continuous_train_and_eval.
-
-  Args:
-    mode: A 'str', specifying the mode. continuous_train_and_eval monitors a
-      checkpoint directory. Once a new checkpoint is discovered, it loads the
-      checkpoint, finetunes the model by training it (possibly on another
-      dataset or with another task), then evaluates the finetuned model.
-    params: ExperimentConfig instance.
-    model_dir: A 'str', a path to store model checkpoints and summaries.
-    run_post_eval: Whether to run post eval once after training; metrics logs
-      are returned.
-    pretrain_steps: Optional, the number of total training steps for the
-      pretraining job.
-
-  Returns:
-    eval logs: returns eval metrics logs when run_post_eval is set to True,
-    otherwise, returns {}.
-  """
-
-  assert mode == 'continuous_train_and_eval', (
-      'Only continuous_train_and_eval is supported by continuous_finetune. '
-      'Got mode: {}'.format(mode))
-
-  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
-  # can have significant impact on model speeds by utilizing float16 in case of
-  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
-  # dtype is float16.
-  if params.runtime.mixed_precision_dtype:
-    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)
-  distribution_strategy = distribute_utils.get_distribution_strategy(
-      distribution_strategy=params.runtime.distribution_strategy,
-      all_reduce_alg=params.runtime.all_reduce_alg,
-      num_gpus=params.runtime.num_gpus,
-      tpu_address=params.runtime.tpu)
-
-  retry_times = 0
-  while not tf.io.gfile.isdir(params.task.init_checkpoint):
-    # Wait for the init_checkpoint directory to be created.
- if retry_times >= 60: - raise ValueError( - 'ExperimentConfig.task.init_checkpoint must be a directory for ' - 'continuous_train_and_eval mode.') - retry_times += 1 - time.sleep(60) - - summary_writer = tf.summary.create_file_writer( - os.path.join(model_dir, 'eval')) - - global_step = 0 - - def timeout_fn(): - if pretrain_steps and global_step < pretrain_steps: - # Keeps waiting for another timeout period. - logging.info( - 'Continue waiting for new checkpoint as current pretrain ' - 'global_step=%d and target is %d.', global_step, pretrain_steps) - return False - # Quits the loop. - return True - - for pretrain_ckpt in tf.train.checkpoints_iterator( - checkpoint_dir=params.task.init_checkpoint, - min_interval_secs=10, - timeout=params.trainer.continuous_eval_timeout, - timeout_fn=timeout_fn): - - # If there are checkpoints, they might be the finetune checkpoint of a - # different pretrained checkpoint. So we just remove all checkpoints. - train_utils.remove_ckpts(model_dir) - - with distribution_strategy.scope(): - global_step = train_utils.read_global_step_from_checkpoint(pretrain_ckpt) - # Replaces params.task.init_checkpoint to make sure that we load - # exactly this pretrain checkpoint. - if params.trainer.best_checkpoint_export_subdir: - best_ckpt_subdir = '{}_{}'.format( - params.trainer.best_checkpoint_export_subdir, global_step) - params_replaced = params.replace( - task={'init_checkpoint': pretrain_ckpt}, - trainer={'best_checkpoint_export_subdir': best_ckpt_subdir}) - else: - params_replaced = params.replace(task={'init_checkpoint': pretrain_ckpt}) - params_replaced.lock() - logging.info('Running finetuning with params: %s', params_replaced) - - with distribution_strategy.scope(): - if isinstance(params, configs.MultiEvalExperimentConfig): - task = task_factory.get_task(params_replaced.task) - eval_tasks = multitask.MultiTask.from_config(params_replaced.eval_tasks) - (_, - eval_metrics) = multitask_train_lib.run_experiment_with_multitask_eval( - distribution_strategy=distribution_strategy, - train_task=task, - eval_tasks=eval_tasks, - mode='train_and_eval', - params=params_replaced, - model_dir=model_dir, - run_post_eval=True, - save_summary=False) - else: - task = task_factory.get_task( - params_replaced.task, logging_dir=model_dir) - _, eval_metrics = train_lib.run_experiment( - distribution_strategy=distribution_strategy, - task=task, - mode='train_and_eval', - params=params_replaced, - model_dir=model_dir, - run_post_eval=True, - save_summary=False) - logging.info('Evaluation finished. Pretrain global_step: %d', global_step) - train_utils.write_json_summary(model_dir, global_step, eval_metrics) - - if not os.path.basename(model_dir): # if model_dir.endswith('/') - summary_grp = os.path.dirname(model_dir) + '_' + task.name - else: - summary_grp = os.path.basename(model_dir) + '_' + task.name - summaries = {} - for name, value in _flatten_dict(eval_metrics).items(): - summaries[summary_grp + '/' + '-'.join(name)] = value - train_utils.write_summary(summary_writer, global_step, summaries) - - train_utils.remove_ckpts(model_dir) - # In TF2, the resource life cycle is bound with the python object life - # cycle. Force trigger python garbage collection here so those resources - # can be deallocated in time, so it doesn't cause OOM when allocating new - # objects. - # TODO(b/169178664): Fix cycle reference in Keras model and revisit to see - # if we need gc here. 
- gc.collect() - - if run_post_eval: - return eval_metrics - return {} diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/continuous_finetune_lib_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/continuous_finetune_lib_test.py deleted file mode 100644 index f43902f687c141a6789a8b6edd97ea3fb973616e..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/continuous_finetune_lib_test.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import os - -from absl import flags -from absl.testing import flagsaver -from absl.testing import parameterized -import tensorflow as tf - -# pylint: disable=unused-import -from official.common import registry_imports -# pylint: enable=unused-import -from official.common import flags as tfm_flags -from official.core import task_factory -from official.core import train_lib -from official.core import train_utils -from official.nlp import continuous_finetune_lib - -FLAGS = flags.FLAGS - -tfm_flags.define_flags() - - -class ContinuousFinetuneTest(tf.test.TestCase, parameterized.TestCase): - - def setUp(self): - super().setUp() - self._model_dir = os.path.join(self.get_temp_dir(), 'model_dir') - - def testContinuousFinetune(self): - pretrain_steps = 1 - src_model_dir = self.get_temp_dir() - flags_dict = dict( - experiment='mock', - mode='continuous_train_and_eval', - model_dir=self._model_dir, - params_override={ - 'task': { - 'init_checkpoint': src_model_dir, - }, - 'trainer': { - 'continuous_eval_timeout': 1, - 'steps_per_loop': 1, - 'train_steps': 1, - 'validation_steps': 1, - 'best_checkpoint_export_subdir': 'best_ckpt', - 'best_checkpoint_eval_metric': 'acc', - 'optimizer_config': { - 'optimizer': { - 'type': 'sgd' - }, - 'learning_rate': { - 'type': 'constant' - } - } - } - }) - - with flagsaver.flagsaver(**flags_dict): - # Train and save some checkpoints. 
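A minimal sketch of invoking the driver above, assuming `params` is a parsed ExperimentConfig whose `task.init_checkpoint` points at the pretraining job's checkpoint directory; the model_dir path and step count are hypothetical:

from official.nlp import continuous_finetune_lib

eval_metrics = continuous_finetune_lib.run_continuous_finetune(
    mode='continuous_train_and_eval',
    params=params,                        # parsed ExperimentConfig (assumed)
    model_dir='/tmp/finetune_model_dir',  # hypothetical
    run_post_eval=True,
    pretrain_steps=1000000)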
- params = train_utils.parse_configuration(flags.FLAGS) - distribution_strategy = tf.distribute.get_strategy() - with distribution_strategy.scope(): - task = task_factory.get_task(params.task, logging_dir=src_model_dir) - _ = train_lib.run_experiment( - distribution_strategy=distribution_strategy, - task=task, - mode='train', - params=params, - model_dir=src_model_dir) - - params = train_utils.parse_configuration(FLAGS) - eval_metrics = continuous_finetune_lib.run_continuous_finetune( - FLAGS.mode, - params, - FLAGS.model_dir, - run_post_eval=True, - pretrain_steps=pretrain_steps) - self.assertIn('best_acc', eval_metrics) - - self.assertFalse( - tf.io.gfile.exists(os.path.join(FLAGS.model_dir, 'checkpoint'))) - - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/__init__.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/__init__.py deleted file mode 100644 index a11b1ff79e891e0fcee5bf824718e75d9103e28f..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/classifier_data_lib.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/classifier_data_lib.py deleted file mode 100644 index 6936b7eb3a8740e7c48e5e82791f054c6acd48b7..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/classifier_data_lib.py +++ /dev/null @@ -1,1528 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""BERT library to process data for classification tasks."""
-
-import collections
-import csv
-import importlib
-import json
-import os
-
-from absl import logging
-import tensorflow as tf
-import tensorflow_datasets as tfds
-
-from official.nlp.bert import tokenization
-
-
-class InputExample(object):
-  """A single training/test example for simple seq regression/classification."""
-
-  def __init__(self,
-               guid,
-               text_a,
-               text_b=None,
-               label=None,
-               weight=None,
-               example_id=None):
-    """Constructs an InputExample.
-
-    Args:
-      guid: Unique id for the example.
-      text_a: string. The untokenized text of the first sequence. For single
-        sequence tasks, only this sequence must be specified.
-      text_b: (Optional) string. The untokenized text of the second sequence.
-        Must only be specified for sequence pair tasks.
-      label: (Optional) string for classification, float for regression. The
-        label of the example. This should be specified for train and dev
-        examples, but not for test examples.
-      weight: (Optional) float. The weight of the example to be used during
-        training.
-      example_id: (Optional) int. The int identification number of the
-        example in the corpus.
- """ - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - self.weight = weight - self.example_id = example_id - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, - input_ids, - input_mask, - segment_ids, - label_id, - is_real_example=True, - weight=None, - example_id=None): - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id - self.is_real_example = is_real_example - self.weight = weight - self.example_id = example_id - - -class DataProcessor(object): - """Base class for converters for seq regression/classification datasets.""" - - def __init__(self, process_text_fn=tokenization.convert_to_unicode): - self.process_text_fn = process_text_fn - self.is_regression = False - self.label_type = None - - def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_test_examples(self, data_dir): - """Gets a collection of `InputExample`s for prediction.""" - raise NotImplementedError() - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - @staticmethod - def get_processor_name(): - """Gets the string identifier of the processor.""" - raise NotImplementedError() - - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with tf.io.gfile.GFile(input_file, "r") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - lines.append(line) - return lines - - @classmethod - def _read_jsonl(cls, input_file): - """Reads a json line file.""" - with tf.io.gfile.GFile(input_file, "r") as f: - lines = [] - for json_str in f: - lines.append(json.loads(json_str)) - return lines - - -class AxProcessor(DataProcessor): - """Processor for the AX dataset (GLUE diagnostics dataset).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_test_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "AX" - - def _create_examples(self, lines, set_type): - """Creates examples for the training/dev/test sets.""" - text_a_index = 1 if set_type == "test" else 8 - text_b_index = 2 if set_type == "test" else 9 - examples = [] - for i, line in enumerate(lines): - # Skip header. 
- if i == 0: - continue - guid = "%s-%s" % (set_type, self.process_text_fn(line[0])) - text_a = self.process_text_fn(line[text_a_index]) - text_b = self.process_text_fn(line[text_b_index]) - if set_type == "test": - label = "contradiction" - else: - label = self.process_text_fn(line[-1]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class ColaProcessor(DataProcessor): - """Processor for the CoLA data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_test_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "COLA" - - def _create_examples(self, lines, set_type): - """Creates examples for the training/dev/test sets.""" - examples = [] - for i, line in enumerate(lines): - # Only the test set has a header. - if set_type == "test" and i == 0: - continue - guid = "%s-%s" % (set_type, i) - if set_type == "test": - text_a = self.process_text_fn(line[1]) - label = "0" - else: - text_a = self.process_text_fn(line[3]) - label = self.process_text_fn(line[1]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class ImdbProcessor(DataProcessor): - """Processor for the IMDb dataset.""" - - def get_labels(self): - return ["neg", "pos"] - - def get_train_examples(self, data_dir): - return self._create_examples(os.path.join(data_dir, "train")) - - def get_dev_examples(self, data_dir): - return self._create_examples(os.path.join(data_dir, "test")) - - @staticmethod - def get_processor_name(): - """See base class.""" - return "IMDB" - - def _create_examples(self, data_dir): - """Creates examples.""" - examples = [] - for label in ["neg", "pos"]: - cur_dir = os.path.join(data_dir, label) - for filename in tf.io.gfile.listdir(cur_dir): - if not filename.endswith("txt"): - continue - - if len(examples) % 1000 == 0: - logging.info("Loading dev example %d", len(examples)) - - path = os.path.join(cur_dir, filename) - with tf.io.gfile.GFile(path, "r") as f: - text = f.read().strip().replace("
", " ") - examples.append( - InputExample( - guid="unused_id", text_a=text, text_b=None, label=label)) - return examples - - -class MnliProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" - - def __init__(self, - mnli_type="matched", - process_text_fn=tokenization.convert_to_unicode): - super(MnliProcessor, self).__init__(process_text_fn) - if mnli_type not in ("matched", "mismatched"): - raise ValueError("Invalid `mnli_type`: %s" % mnli_type) - self.mnli_type = mnli_type - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - if self.mnli_type == "matched": - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), - "dev_matched") - else: - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), - "dev_mismatched") - - def get_test_examples(self, data_dir): - """See base class.""" - if self.mnli_type == "matched": - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test") - else: - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "test_mismatched.tsv")), "test") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "MNLI" - - def _create_examples(self, lines, set_type): - """Creates examples for the training/dev/test sets.""" - examples = [] - for i, line in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, self.process_text_fn(line[0])) - text_a = self.process_text_fn(line[8]) - text_b = self.process_text_fn(line[9]) - if set_type == "test": - label = "contradiction" - else: - label = self.process_text_fn(line[-1]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MrpcProcessor(DataProcessor): - """Processor for the MRPC data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_test_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "MRPC" - - def _create_examples(self, lines, set_type): - """Creates examples for the training/dev/test sets.""" - examples = [] - for i, line in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = self.process_text_fn(line[3]) - text_b = self.process_text_fn(line[4]) - if set_type == "test": - label = "0" - else: - label = self.process_text_fn(line[0]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class PawsxProcessor(DataProcessor): - """Processor for the PAWS-X data set.""" - supported_languages = ["de", "en", "es", "fr", "ja", "ko", "zh"] - - def __init__(self, - language="en", - process_text_fn=tokenization.convert_to_unicode): - super(PawsxProcessor, 
 self).__init__(process_text_fn)
-    if language == "all":
-      self.languages = PawsxProcessor.supported_languages
-    elif language not in PawsxProcessor.supported_languages:
-      raise ValueError("language %s is not supported for PAWS-X task." %
-                       language)
-    else:
-      self.languages = [language]
-
-  def get_train_examples(self, data_dir):
-    """See base class."""
-    lines = []
-    for language in self.languages:
-      if language == "en":
-        train_tsv = "train.tsv"
-      else:
-        train_tsv = "translated_train.tsv"
-      # Skips the header.
-      lines.extend(
-          self._read_tsv(os.path.join(data_dir, language, train_tsv))[1:])
-
-    examples = []
-    for i, line in enumerate(lines):
-      guid = "train-%d" % i
-      text_a = self.process_text_fn(line[1])
-      text_b = self.process_text_fn(line[2])
-      label = self.process_text_fn(line[3])
-      examples.append(
-          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-    return examples
-
-  def get_dev_examples(self, data_dir):
-    """See base class."""
-    lines = []
-    for lang in PawsxProcessor.supported_languages:
-      lines.extend(
-          self._read_tsv(os.path.join(data_dir, lang, "dev_2k.tsv"))[1:])
-
-    examples = []
-    for i, line in enumerate(lines):
-      guid = "dev-%d" % i
-      text_a = self.process_text_fn(line[1])
-      text_b = self.process_text_fn(line[2])
-      label = self.process_text_fn(line[3])
-      examples.append(
-          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-    return examples
-
-  def get_test_examples(self, data_dir):
-    """See base class."""
-    examples_by_lang = {k: [] for k in self.supported_languages}
-    for lang in self.supported_languages:
-      lines = self._read_tsv(os.path.join(data_dir, lang, "test_2k.tsv"))[1:]
-      for i, line in enumerate(lines):
-        guid = "test-%d" % i
-        text_a = self.process_text_fn(line[1])
-        text_b = self.process_text_fn(line[2])
-        label = self.process_text_fn(line[3])
-        examples_by_lang[lang].append(
-            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-    return examples_by_lang
-
-  def get_labels(self):
-    """See base class."""
-    return ["0", "1"]
-
-  @staticmethod
-  def get_processor_name():
-    """See base class."""
-    return "XTREME-PAWS-X"
-
-
-class QnliProcessor(DataProcessor):
-  """Processor for the QNLI data set (GLUE version)."""
-
-  def get_train_examples(self, data_dir):
-    """See base class."""
-    return self._create_examples(
-        self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-  def get_dev_examples(self, data_dir):
-    """See base class."""
-    return self._create_examples(
-        self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched")
-
-  def get_test_examples(self, data_dir):
-    """See base class."""
-    return self._create_examples(
-        self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
-
-  def get_labels(self):
-    """See base class."""
-    return ["entailment", "not_entailment"]
-
-  @staticmethod
-  def get_processor_name():
-    """See base class."""
-    return "QNLI"
-
-  def _create_examples(self, lines, set_type):
-    """Creates examples for the training/dev/test sets."""
-    examples = []
-    for i, line in enumerate(lines):
-      if i == 0:
-        continue
-      guid = "%s-%s" % (set_type, i)
-      if set_type == "test":
-        text_a = tokenization.convert_to_unicode(line[1])
-        text_b = tokenization.convert_to_unicode(line[2])
-        label = "entailment"
-      else:
-        text_a = tokenization.convert_to_unicode(line[1])
-        text_b = tokenization.convert_to_unicode(line[2])
-        label = tokenization.convert_to_unicode(line[-1])
-      examples.append(
-          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-    return examples
-
-
-class QqpProcessor(DataProcessor):
-  """Processor for the QQP data set (GLUE version)."""
-
-  def get_train_examples(self, data_dir):
-    """See base class."""
-    return self._create_examples(
-        self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-  def get_dev_examples(self, data_dir):
-    """See base class."""
-    return self._create_examples(
-        self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-  def get_test_examples(self, data_dir):
-    """See base class."""
-    return self._create_examples(
-        self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
-
-  def get_labels(self):
-    """See base class."""
-    return ["0", "1"]
-
-  @staticmethod
-  def get_processor_name():
-    """See base class."""
-    return "QQP"
-
-  def _create_examples(self, lines, set_type):
-    """Creates examples for the training/dev/test sets."""
-    examples = []
-    for i, line in enumerate(lines):
-      if i == 0:
-        continue
-      guid = "%s-%s" % (set_type, line[0])
-      if set_type == "test":
-        text_a = line[1]
-        text_b = line[2]
-        label = "0"
-      else:
-        # There appear to be some garbage lines in the train dataset.
-        try:
-          text_a = line[3]
-          text_b = line[4]
-          label = line[5]
-        except IndexError:
-          continue
-      examples.append(
-          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
-    return examples
-
-
-class RteProcessor(DataProcessor):
-  """Processor for the RTE data set (GLUE version)."""
-
-  def get_train_examples(self, data_dir):
-    """See base class."""
-    return self._create_examples(
-        self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
-
-  def get_dev_examples(self, data_dir):
-    """See base class."""
-    return self._create_examples(
-        self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
-
-  def get_test_examples(self, data_dir):
-    """See base class."""
-    return self._create_examples(
-        self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")
-
-  def get_labels(self):
-    """See base class."""
-    # All datasets are converted to 2-class split, where for 3-class datasets we
-    # collapse neutral and contradiction into not_entailment.
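-    # (GLUE RTE is itself a 2-class task, so its labels pass through as-is.)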
- return ["entailment", "not_entailment"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "RTE" - - def _create_examples(self, lines, set_type): - """Creates examples for the training/dev/test sets.""" - examples = [] - for i, line in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = tokenization.convert_to_unicode(line[1]) - text_b = tokenization.convert_to_unicode(line[2]) - if set_type == "test": - label = "entailment" - else: - label = tokenization.convert_to_unicode(line[3]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class SstProcessor(DataProcessor): - """Processor for the SST-2 data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_test_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "SST-2" - - def _create_examples(self, lines, set_type): - """Creates examples for the training/dev/test sets.""" - examples = [] - for i, line in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - if set_type == "test": - text_a = tokenization.convert_to_unicode(line[1]) - label = "0" - else: - text_a = tokenization.convert_to_unicode(line[0]) - label = tokenization.convert_to_unicode(line[1]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class StsBProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" - - def __init__(self, process_text_fn=tokenization.convert_to_unicode): - super(StsBProcessor, self).__init__(process_text_fn=process_text_fn) - self.is_regression = True - self.label_type = float - self._labels = None - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_test_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") - - def get_labels(self): - """See base class.""" - return self._labels - - @staticmethod - def get_processor_name(): - """See base class.""" - return "STS-B" - - def _create_examples(self, lines, set_type): - """Creates examples for the training/dev/test sets.""" - examples = [] - for i, line in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = tokenization.convert_to_unicode(line[7]) - text_b = tokenization.convert_to_unicode(line[8]) - if set_type == "test": - label = 0.0 - else: - label = self.label_type(tokenization.convert_to_unicode(line[9])) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class TfdsProcessor(DataProcessor): - """Processor for generic text classification and regression TFDS data set. 
-
-  The TFDS parameters are expected to be provided in the tfds_params string,
-  in a comma-separated list of parameter assignments.
-  Examples:
-    tfds_params="dataset=scicite,text_key=string"
-    tfds_params="dataset=imdb_reviews,test_split=,dev_split=test"
-    tfds_params="dataset=glue/cola,text_key=sentence"
-    tfds_params="dataset=glue/sst2,text_key=sentence"
-    tfds_params="dataset=glue/qnli,text_key=question,text_b_key=sentence"
-    tfds_params="dataset=glue/mrpc,text_key=sentence1,text_b_key=sentence2"
-    tfds_params="dataset=glue/stsb,text_key=sentence1,text_b_key=sentence2,"
-                "is_regression=true,label_type=float"
-    tfds_params="dataset=snli,text_key=premise,text_b_key=hypothesis,"
-                "skip_label=-1"
-  Possible parameters (please refer to the documentation of TensorFlow
-  Datasets (TFDS) for the meaning of individual parameters):
-    dataset: Required dataset name (potentially with subset and version
-      number).
-    data_dir: Optional TFDS source root directory.
-    module_import: Optional Dataset module to import.
-    train_split: Name of the train split (defaults to `train`).
-    dev_split: Name of the dev split (defaults to `validation`).
-    test_split: Name of the test split (defaults to `test`).
-    text_key: Key of the text_a feature (defaults to `text`).
-    text_b_key: Key of the second text feature if available.
-    label_key: Key of the label feature (defaults to `label`).
-    test_text_key: Key of the text feature to use in the test set.
-    test_text_b_key: Key of the second text feature to use in the test set.
-    test_label: String to be used as the label for all test examples.
-    label_type: Type of the label key (defaults to `int`).
-    weight_key: Key of the float sample weight (not used if not provided).
-    is_regression: Whether the task is a regression problem (defaults to
-      False).
-    skip_label: Skip examples with the given label (defaults to None).
-  """
-
-  def __init__(self,
-               tfds_params,
-               process_text_fn=tokenization.convert_to_unicode):
-    super(TfdsProcessor, self).__init__(process_text_fn)
-    self._process_tfds_params_str(tfds_params)
-    if self.module_import:
-      importlib.import_module(self.module_import)
-
-    self.dataset, info = tfds.load(
-        self.dataset_name, data_dir=self.data_dir, with_info=True)
-    if self.is_regression:
-      self._labels = None
-    else:
-      self._labels = list(range(info.features[self.label_key].num_classes))
-
-  def _process_tfds_params_str(self, params_str):
-    """Extracts TFDS parameters from a comma-separated assignments string."""
-    dtype_map = {"int": int, "float": float}
-    cast_str_to_bool = lambda s: s.lower() not in ["false", "0"]
-
-    tuples = [x.split("=") for x in params_str.split(",")]
-    d = {k.strip(): v.strip() for k, v in tuples}
-    self.dataset_name = d["dataset"]  # Required.
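-    # For example, tfds_params="dataset=glue/cola,text_key=sentence" parses
-    # to d = {"dataset": "glue/cola", "text_key": "sentence"}; every other
-    # parameter below then falls back to its default.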
- self.data_dir = d.get("data_dir", None) - self.module_import = d.get("module_import", None) - self.train_split = d.get("train_split", "train") - self.dev_split = d.get("dev_split", "validation") - self.test_split = d.get("test_split", "test") - self.text_key = d.get("text_key", "text") - self.text_b_key = d.get("text_b_key", None) - self.label_key = d.get("label_key", "label") - self.test_text_key = d.get("test_text_key", self.text_key) - self.test_text_b_key = d.get("test_text_b_key", self.text_b_key) - self.test_label = d.get("test_label", "test_example") - self.label_type = dtype_map[d.get("label_type", "int")] - self.is_regression = cast_str_to_bool(d.get("is_regression", "False")) - self.weight_key = d.get("weight_key", None) - self.skip_label = d.get("skip_label", None) - if self.skip_label is not None: - self.skip_label = self.label_type(self.skip_label) - - def get_train_examples(self, data_dir): - assert data_dir is None - return self._create_examples(self.train_split, "train") - - def get_dev_examples(self, data_dir): - assert data_dir is None - return self._create_examples(self.dev_split, "dev") - - def get_test_examples(self, data_dir): - assert data_dir is None - return self._create_examples(self.test_split, "test") - - def get_labels(self): - return self._labels - - def get_processor_name(self): - return "TFDS_" + self.dataset_name - - def _create_examples(self, split_name, set_type): - """Creates examples for the training/dev/test sets.""" - if split_name not in self.dataset: - raise ValueError("Split {} not available.".format(split_name)) - dataset = self.dataset[split_name].as_numpy_iterator() - examples = [] - text_b, weight = None, None - for i, example in enumerate(dataset): - guid = "%s-%s" % (set_type, i) - if set_type == "test": - text_a = self.process_text_fn(example[self.test_text_key]) - if self.test_text_b_key: - text_b = self.process_text_fn(example[self.test_text_b_key]) - label = self.test_label - else: - text_a = self.process_text_fn(example[self.text_key]) - if self.text_b_key: - text_b = self.process_text_fn(example[self.text_b_key]) - label = self.label_type(example[self.label_key]) - if self.skip_label is not None and label == self.skip_label: - continue - if self.weight_key: - weight = float(example[self.weight_key]) - examples.append( - InputExample( - guid=guid, - text_a=text_a, - text_b=text_b, - label=label, - weight=weight)) - return examples - - -class WnliProcessor(DataProcessor): - """Processor for the WNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_test_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "WNLI" - - def _create_examples(self, lines, set_type): - """Creates examples for the training/dev/test sets.""" - examples = [] - for i, line in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = tokenization.convert_to_unicode(line[1]) - text_b = tokenization.convert_to_unicode(line[2]) - if set_type == "test": - label = "0" - else: - label = 
tokenization.convert_to_unicode(line[3]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class XnliProcessor(DataProcessor): - """Processor for the XNLI data set.""" - supported_languages = [ - "ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw", "th", "tr", - "ur", "vi", "zh" - ] - - def __init__(self, - language="en", - process_text_fn=tokenization.convert_to_unicode): - super(XnliProcessor, self).__init__(process_text_fn) - if language == "all": - self.languages = XnliProcessor.supported_languages - elif language not in XnliProcessor.supported_languages: - raise ValueError("language %s is not supported for XNLI task." % language) - else: - self.languages = [language] - - def get_train_examples(self, data_dir): - """See base class.""" - lines = [] - for language in self.languages: - # Skips the header. - lines.extend( - self._read_tsv( - os.path.join(data_dir, "multinli", - "multinli.train.%s.tsv" % language))[1:]) - - examples = [] - for i, line in enumerate(lines): - guid = "train-%d" % i - text_a = self.process_text_fn(line[0]) - text_b = self.process_text_fn(line[1]) - label = self.process_text_fn(line[2]) - if label == self.process_text_fn("contradictory"): - label = self.process_text_fn("contradiction") - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_dev_examples(self, data_dir): - """See base class.""" - lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) - examples = [] - for i, line in enumerate(lines): - if i == 0: - continue - guid = "dev-%d" % i - text_a = self.process_text_fn(line[6]) - text_b = self.process_text_fn(line[7]) - label = self.process_text_fn(line[1]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_test_examples(self, data_dir): - """See base class.""" - lines = self._read_tsv(os.path.join(data_dir, "xnli.test.tsv")) - examples_by_lang = {k: [] for k in XnliProcessor.supported_languages} - for i, line in enumerate(lines): - if i == 0: - continue - guid = "test-%d" % i - language = self.process_text_fn(line[0]) - text_a = self.process_text_fn(line[6]) - text_b = self.process_text_fn(line[7]) - label = self.process_text_fn(line[1]) - examples_by_lang[language].append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples_by_lang - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "XNLI" - - -class XtremePawsxProcessor(DataProcessor): - """Processor for the XTREME PAWS-X data set.""" - supported_languages = ["de", "en", "es", "fr", "ja", "ko", "zh"] - - def __init__(self, - process_text_fn=tokenization.convert_to_unicode, - translated_data_dir=None, - only_use_en_dev=True): - """See base class. - - Args: - process_text_fn: See base class. - translated_data_dir: If specified, will also include translated data in - the training and testing data. - only_use_en_dev: If True, only use english dev data. Otherwise, use dev - data from all languages. 
- """ - super(XtremePawsxProcessor, self).__init__(process_text_fn) - self.translated_data_dir = translated_data_dir - self.only_use_en_dev = only_use_en_dev - - def get_train_examples(self, data_dir): - """See base class.""" - examples = [] - if self.translated_data_dir is None: - lines = self._read_tsv(os.path.join(data_dir, "train-en.tsv")) - for i, line in enumerate(lines): - guid = "train-%d" % i - text_a = self.process_text_fn(line[0]) - text_b = self.process_text_fn(line[1]) - label = self.process_text_fn(line[2]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - else: - for lang in self.supported_languages: - lines = self._read_tsv( - os.path.join(self.translated_data_dir, "translate-train", - f"en-{lang}-translated.tsv")) - for i, line in enumerate(lines): - guid = f"train-{lang}-{i}" - text_a = self.process_text_fn(line[2]) - text_b = self.process_text_fn(line[3]) - label = self.process_text_fn(line[4]) - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_dev_examples(self, data_dir): - """See base class.""" - examples = [] - if self.only_use_en_dev: - lines = self._read_tsv(os.path.join(data_dir, "dev-en.tsv")) - for i, line in enumerate(lines): - guid = "dev-%d" % i - text_a = self.process_text_fn(line[0]) - text_b = self.process_text_fn(line[1]) - label = self.process_text_fn(line[2]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - else: - for lang in self.supported_languages: - lines = self._read_tsv(os.path.join(data_dir, f"dev-{lang}.tsv")) - for i, line in enumerate(lines): - guid = f"dev-{lang}-{i}" - text_a = self.process_text_fn(line[0]) - text_b = self.process_text_fn(line[1]) - label = self.process_text_fn(line[2]) - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_test_examples(self, data_dir): - """See base class.""" - examples_by_lang = {} - for lang in self.supported_languages: - examples_by_lang[lang] = [] - lines = self._read_tsv(os.path.join(data_dir, f"test-{lang}.tsv")) - for i, line in enumerate(lines): - guid = f"test-{lang}-{i}" - text_a = self.process_text_fn(line[0]) - text_b = self.process_text_fn(line[1]) - label = "0" - examples_by_lang[lang].append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - if self.translated_data_dir is not None: - for lang in self.supported_languages: - if lang == "en": - continue - examples_by_lang[f"{lang}-en"] = [] - lines = self._read_tsv( - os.path.join(self.translated_data_dir, "translate-test", - f"test-{lang}-en-translated.tsv")) - for i, line in enumerate(lines): - guid = f"test-{lang}-en-{i}" - text_a = self.process_text_fn(line[2]) - text_b = self.process_text_fn(line[3]) - label = "0" - examples_by_lang[f"{lang}-en"].append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples_by_lang - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "XTREME-PAWS-X" - - -class XtremeXnliProcessor(DataProcessor): - """Processor for the XTREME XNLI data set.""" - supported_languages = [ - "ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw", "th", "tr", - "ur", "vi", "zh" - ] - - def __init__(self, - process_text_fn=tokenization.convert_to_unicode, - translated_data_dir=None, - only_use_en_dev=True): - """See base class. 
- - Args: - process_text_fn: See base class. - translated_data_dir: If specified, will also include translated data in - the training data. - only_use_en_dev: If True, only use english dev data. Otherwise, use dev - data from all languages. - """ - super(XtremeXnliProcessor, self).__init__(process_text_fn) - self.translated_data_dir = translated_data_dir - self.only_use_en_dev = only_use_en_dev - - def get_train_examples(self, data_dir): - """See base class.""" - lines = self._read_tsv(os.path.join(data_dir, "train-en.tsv")) - - examples = [] - if self.translated_data_dir is None: - for i, line in enumerate(lines): - guid = "train-%d" % i - text_a = self.process_text_fn(line[0]) - text_b = self.process_text_fn(line[1]) - label = self.process_text_fn(line[2]) - if label == self.process_text_fn("contradictory"): - label = self.process_text_fn("contradiction") - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - else: - for lang in self.supported_languages: - lines = self._read_tsv( - os.path.join(self.translated_data_dir, "translate-train", - f"en-{lang}-translated.tsv")) - for i, line in enumerate(lines): - guid = f"train-{lang}-{i}" - text_a = self.process_text_fn(line[2]) - text_b = self.process_text_fn(line[3]) - label = self.process_text_fn(line[4]) - if label == self.process_text_fn("contradictory"): - label = self.process_text_fn("contradiction") - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_dev_examples(self, data_dir): - """See base class.""" - examples = [] - if self.only_use_en_dev: - lines = self._read_tsv(os.path.join(data_dir, "dev-en.tsv")) - for i, line in enumerate(lines): - guid = "dev-%d" % i - text_a = self.process_text_fn(line[0]) - text_b = self.process_text_fn(line[1]) - label = self.process_text_fn(line[2]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - else: - for lang in self.supported_languages: - lines = self._read_tsv(os.path.join(data_dir, f"dev-{lang}.tsv")) - for i, line in enumerate(lines): - guid = f"dev-{lang}-{i}" - text_a = self.process_text_fn(line[0]) - text_b = self.process_text_fn(line[1]) - label = self.process_text_fn(line[2]) - if label == self.process_text_fn("contradictory"): - label = self.process_text_fn("contradiction") - examples.append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_test_examples(self, data_dir): - """See base class.""" - examples_by_lang = {} - for lang in self.supported_languages: - examples_by_lang[lang] = [] - lines = self._read_tsv(os.path.join(data_dir, f"test-{lang}.tsv")) - for i, line in enumerate(lines): - guid = f"test-{lang}-{i}" - text_a = self.process_text_fn(line[0]) - text_b = self.process_text_fn(line[1]) - label = "contradiction" - examples_by_lang[lang].append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - if self.translated_data_dir is not None: - for lang in self.supported_languages: - if lang == "en": - continue - examples_by_lang[f"{lang}-en"] = [] - lines = self._read_tsv( - os.path.join(self.translated_data_dir, "translate-test", - f"test-{lang}-en-translated.tsv")) - for i, line in enumerate(lines): - guid = f"test-{lang}-en-{i}" - text_a = self.process_text_fn(line[2]) - text_b = self.process_text_fn(line[3]) - label = "contradiction" - examples_by_lang[f"{lang}-en"].append( - InputExample( - guid=guid, text_a=text_a, text_b=text_b, label=label)) - return 
examples_by_lang - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "XTREME-XNLI" - - -def convert_single_example(ex_index, example, label_list, max_seq_length, - tokenizer): - """Converts a single `InputExample` into a single `InputFeatures`.""" - label_map = {} - if label_list: - for (i, label) in enumerate(label_list): - label_map[label] = i - - tokens_a = tokenizer.tokenize(example.text_a) - tokens_b = None - if example.text_b: - tokens_b = tokenizer.tokenize(example.text_b) - - if tokens_b: - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3" - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) - else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[0:(max_seq_length - 2)] - - seg_id_a = 0 - seg_id_b = 1 - seg_id_cls = 0 - seg_id_pad = 0 - - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens = [] - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(seg_id_cls) - for token in tokens_a: - tokens.append(token) - segment_ids.append(seg_id_a) - tokens.append("[SEP]") - segment_ids.append(seg_id_a) - - if tokens_b: - for token in tokens_b: - tokens.append(token) - segment_ids.append(seg_id_b) - tokens.append("[SEP]") - segment_ids.append(seg_id_b) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. 
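-  # For instance, with max_seq_length=8 and five real tokens (including
-  # [CLS]/[SEP]), input_mask becomes [1, 1, 1, 1, 1, 0, 0, 0] and the padded
-  # positions of input_ids/segment_ids are all 0.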
- while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(seg_id_pad) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - label_id = label_map[example.label] if label_map else example.label - if ex_index < 5: - logging.info("*** Example ***") - logging.info("guid: %s", (example.guid)) - logging.info("tokens: %s", - " ".join([tokenization.printable_text(x) for x in tokens])) - logging.info("input_ids: %s", " ".join([str(x) for x in input_ids])) - logging.info("input_mask: %s", " ".join([str(x) for x in input_mask])) - logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) - logging.info("label: %s (id = %s)", example.label, str(label_id)) - logging.info("weight: %s", example.weight) - logging.info("example_id: %s", example.example_id) - - feature = InputFeatures( - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - label_id=label_id, - is_real_example=True, - weight=example.weight, - example_id=example.example_id) - - return feature - - -class AXgProcessor(DataProcessor): - """Processor for the AXg dataset (SuperGLUE diagnostics dataset).""" - - def get_test_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_jsonl(os.path.join(data_dir, "AX-g.jsonl")), "test") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "AXg" - - def _create_examples(self, lines, set_type): - """Creates examples for the training/dev/test sets.""" - examples = [] - for line in lines: - guid = "%s-%s" % (set_type, self.process_text_fn(str(line["idx"]))) - text_a = self.process_text_fn(line["premise"]) - text_b = self.process_text_fn(line["hypothesis"]) - label = self.process_text_fn(line["label"]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class SuperGLUERTEProcessor(DataProcessor): - """Processor for the RTE dataset (SuperGLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_jsonl(os.path.join(data_dir, "train.jsonl")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_jsonl(os.path.join(data_dir, "val.jsonl")), "dev") - - def get_test_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_jsonl(os.path.join(data_dir, "test.jsonl")), "test") - - def get_labels(self): - """See base class.""" - # All datasets are converted to 2-class split, where for 3-class datasets we - # collapse neutral and contradiction into not_entailment. 
- return ["entailment", "not_entailment"] - - @staticmethod - def get_processor_name(): - """See base class.""" - return "RTESuperGLUE" - - def _create_examples(self, lines, set_type): - """Creates examples for the training/dev/test sets.""" - examples = [] - for i, line in enumerate(lines): - guid = "%s-%s" % (set_type, i) - text_a = self.process_text_fn(line["premise"]) - text_b = self.process_text_fn(line["hypothesis"]) - if set_type == "test": - label = "entailment" - else: - label = self.process_text_fn(line["label"]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -def file_based_convert_examples_to_features(examples, - label_list, - max_seq_length, - tokenizer, - output_file, - label_type=None): - """Convert a set of `InputExample`s to a TFRecord file.""" - - tf.io.gfile.makedirs(os.path.dirname(output_file)) - writer = tf.io.TFRecordWriter(output_file) - - for ex_index, example in enumerate(examples): - if ex_index % 10000 == 0: - logging.info("Writing example %d of %d", ex_index, len(examples)) - - feature = convert_single_example(ex_index, example, label_list, - max_seq_length, tokenizer) - - def create_int_feature(values): - f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) - return f - - def create_float_feature(values): - f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) - return f - - features = collections.OrderedDict() - features["input_ids"] = create_int_feature(feature.input_ids) - features["input_mask"] = create_int_feature(feature.input_mask) - features["segment_ids"] = create_int_feature(feature.segment_ids) - if label_type is not None and label_type == float: - features["label_ids"] = create_float_feature([feature.label_id]) - elif feature.label_id is not None: - features["label_ids"] = create_int_feature([feature.label_id]) - features["is_real_example"] = create_int_feature( - [int(feature.is_real_example)]) - if feature.weight is not None: - features["weight"] = create_float_feature([feature.weight]) - if feature.example_id is not None: - features["example_id"] = create_int_feature([feature.example_id]) - else: - features["example_id"] = create_int_feature([ex_index]) - - tf_example = tf.train.Example(features=tf.train.Features(feature=features)) - writer.write(tf_example.SerializeToString()) - writer.close() - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -def generate_tf_record_from_data_file(processor, - data_dir, - tokenizer, - train_data_output_path=None, - eval_data_output_path=None, - test_data_output_path=None, - max_seq_length=128): - """Generates and saves training data into a tf record file. - - Args: - processor: Input processor object to be used for generating data. Subclass - of `DataProcessor`. - data_dir: Directory that contains train/eval/test data to process. - tokenizer: The tokenizer to be applied on the data. 
-    train_data_output_path: Output to which processed tf record for training
-      will be saved.
-    eval_data_output_path: Output to which processed tf record for evaluation
-      will be saved.
-    test_data_output_path: Output to which processed tf record for testing
-      will be saved. Must be a pattern template with {} if processor has
-      language-specific test data.
-    max_seq_length: Maximum sequence length of the training/eval data to be
-      generated.
-
-  Returns:
-    A dictionary containing input meta data.
-  """
-  assert train_data_output_path or eval_data_output_path
-
-  label_list = processor.get_labels()
-  label_type = getattr(processor, "label_type", None)
-  is_regression = getattr(processor, "is_regression", False)
-  has_sample_weights = getattr(processor, "weight_key", False)
-
-  num_training_data = 0
-  if train_data_output_path:
-    train_input_data_examples = processor.get_train_examples(data_dir)
-    file_based_convert_examples_to_features(train_input_data_examples,
-                                            label_list, max_seq_length,
-                                            tokenizer, train_data_output_path,
-                                            label_type)
-    num_training_data = len(train_input_data_examples)
-
-  if eval_data_output_path:
-    eval_input_data_examples = processor.get_dev_examples(data_dir)
-    file_based_convert_examples_to_features(eval_input_data_examples,
-                                            label_list, max_seq_length,
-                                            tokenizer, eval_data_output_path,
-                                            label_type)
-
-  meta_data = {
-      "processor_type": processor.get_processor_name(),
-      "train_data_size": num_training_data,
-      "max_seq_length": max_seq_length,
-  }
-
-  if test_data_output_path:
-    test_input_data_examples = processor.get_test_examples(data_dir)
-    if isinstance(test_input_data_examples, dict):
-      for language, examples in test_input_data_examples.items():
-        file_based_convert_examples_to_features(
-            examples, label_list, max_seq_length, tokenizer,
-            test_data_output_path.format(language), label_type)
-        meta_data["test_{}_data_size".format(language)] = len(examples)
-    else:
-      file_based_convert_examples_to_features(test_input_data_examples,
-                                              label_list, max_seq_length,
-                                              tokenizer, test_data_output_path,
-                                              label_type)
-      meta_data["test_data_size"] = len(test_input_data_examples)
-
-  if is_regression:
-    meta_data["task_type"] = "bert_regression"
-    meta_data["label_type"] = {int: "int", float: "float"}[label_type]
-  else:
-    meta_data["task_type"] = "bert_classification"
-    meta_data["num_labels"] = len(processor.get_labels())
-  if has_sample_weights:
-    meta_data["has_sample_weights"] = True
-
-  if eval_data_output_path:
-    meta_data["eval_data_size"] = len(eval_input_data_examples)
-
-  return meta_data
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/create_finetuning_data.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/create_finetuning_data.py
deleted file mode 100644
index 246c1ad219b9e83ccb2f289dfe1d2f788a40aec9..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/create_finetuning_data.py
+++ /dev/null
@@ -1,434 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""BERT finetuning task dataset generator."""
-
-import functools
-import json
-import os
-
-# Import libraries
-from absl import app
-from absl import flags
-import tensorflow as tf
-from official.nlp.bert import tokenization
-from official.nlp.data import classifier_data_lib
-from official.nlp.data import sentence_retrieval_lib
-# word-piece tokenizer based squad_lib
-from official.nlp.data import squad_lib as squad_lib_wp
-# sentence-piece tokenizer based squad_lib
-from official.nlp.data import squad_lib_sp
-from official.nlp.data import tagging_data_lib
-
-FLAGS = flags.FLAGS
-
-# TODO(chendouble): consider moving each task to its own binary.
-flags.DEFINE_enum(
-    "fine_tuning_task_type", "classification",
-    ["classification", "regression", "squad", "retrieval", "tagging"],
-    "The name of the BERT fine-tuning task for which data "
-    "will be generated.")
-
-# BERT classification specific flags.
-flags.DEFINE_string(
-    "input_data_dir", None,
-    "The input data dir. Should contain the .tsv files (or other data files) "
-    "for the task.")
-
-flags.DEFINE_enum(
-    "classification_task_name", "MNLI", [
-        "AX", "COLA", "IMDB", "MNLI", "MRPC", "PAWS-X", "QNLI", "QQP", "RTE",
-        "SST-2", "STS-B", "WNLI", "XNLI", "XTREME-XNLI", "XTREME-PAWS-X",
-        "AX-g", "SUPERGLUE-RTE"
-    ], "The name of the task on which to train the BERT classifier. The "
-    "difference between XTREME-XNLI and XNLI is: 1. the format "
-    "of input tsv files; 2. the dev set for XTREME is English "
-    "only and for XNLI is all languages combined. Same for "
-    "PAWS-X.")
-
-# MNLI task-specific flag.
-flags.DEFINE_enum("mnli_type", "matched", ["matched", "mismatched"],
-                  "The type of MNLI dataset.")
-
-# XNLI task-specific flag.
-flags.DEFINE_string(
-    "xnli_language", "en",
-    "Language of training data for the XNLI task. If the value is 'all', the "
-    "data of all languages will be used for training.")
-
-# PAWS-X task-specific flag.
-flags.DEFINE_string(
-    "pawsx_language", "en",
-    "Language of training data for the PAWS-X task. If the value is 'all', "
-    "the data of all languages will be used for training.")
-
-# XTREME classification specific flags. Only used in XtremePawsx and XtremeXnli.
-flags.DEFINE_string(
-    "translated_input_data_dir", None,
-    "The translated input data dir. Should contain the .tsv files (or other "
-    "data files) for the task.")
-
-# Retrieval task-specific flags.
-flags.DEFINE_enum("retrieval_task_name", "bucc", ["bucc", "tatoeba"], - "The name of sentence retrieval task for scoring") - -# Tagging task-specific flags. -flags.DEFINE_enum("tagging_task_name", "panx", ["panx", "udpos"], - "The name of BERT tagging (token classification) task.") - -flags.DEFINE_bool("tagging_only_use_en_train", True, - "Whether only use english training data in tagging.") - -# BERT Squad task-specific flags. -flags.DEFINE_string( - "squad_data_file", None, - "The input data file in for generating training data for BERT squad task.") - -flags.DEFINE_string( - "translated_squad_data_folder", None, - "The translated data folder for generating training data for BERT squad " - "task.") - -flags.DEFINE_integer( - "doc_stride", 128, - "When splitting up a long document into chunks, how much stride to " - "take between chunks.") - -flags.DEFINE_integer( - "max_query_length", 64, - "The maximum number of tokens for the question. Questions longer than " - "this will be truncated to this length.") - -flags.DEFINE_bool( - "version_2_with_negative", False, - "If true, the SQuAD examples contain some that do not have an answer.") - -flags.DEFINE_bool( - "xlnet_format", False, - "If true, then data will be preprocessed in a paragraph, query, class order" - " instead of the BERT-style class, paragraph, query order.") - -# XTREME specific flags. -flags.DEFINE_bool("only_use_en_dev", True, "Whether only use english dev data.") - -# Shared flags across BERT fine-tuning tasks. -flags.DEFINE_string("vocab_file", None, - "The vocabulary file that the BERT model was trained on.") - -flags.DEFINE_string( - "train_data_output_path", None, - "The path in which generated training input data will be written as tf" - " records.") - -flags.DEFINE_string( - "eval_data_output_path", None, - "The path in which generated evaluation input data will be written as tf" - " records.") - -flags.DEFINE_string( - "test_data_output_path", None, - "The path in which generated test input data will be written as tf" - " records. If None, do not generate test data. Must be a pattern template" - " as test_{}.tfrecords if processor has language specific test data.") - -flags.DEFINE_string("meta_data_file_path", None, - "The path in which input meta data will be written.") - -flags.DEFINE_bool( - "do_lower_case", True, - "Whether to lower case the input text. Should be True for uncased " - "models and False for cased models.") - -flags.DEFINE_integer( - "max_seq_length", 128, - "The maximum total input sequence length after WordPiece tokenization. " - "Sequences longer than this will be truncated, and sequences shorter " - "than this will be padded.") - -flags.DEFINE_string("sp_model_file", "", - "The path to the model used by sentence piece tokenizer.") - -flags.DEFINE_enum( - "tokenization", "WordPiece", ["WordPiece", "SentencePiece"], - "Specifies the tokenizer implementation, i.e., whether to use WordPiece " - "or SentencePiece tokenizer. 
Canonical BERT uses WordPiece tokenizer, " - "while ALBERT uses SentencePiece tokenizer.") - -flags.DEFINE_string( - "tfds_params", "", "Comma-separated list of TFDS parameter assigments for " - "generic classfication data import (for more details " - "see the TfdsProcessor class documentation).") - - -def generate_classifier_dataset(): - """Generates classifier dataset and returns input meta data.""" - assert (FLAGS.input_data_dir and FLAGS.classification_task_name or - FLAGS.tfds_params) - - if FLAGS.tokenization == "WordPiece": - tokenizer = tokenization.FullTokenizer( - vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) - processor_text_fn = tokenization.convert_to_unicode - else: - assert FLAGS.tokenization == "SentencePiece" - tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file) - processor_text_fn = functools.partial( - tokenization.preprocess_text, lower=FLAGS.do_lower_case) - - if FLAGS.tfds_params: - processor = classifier_data_lib.TfdsProcessor( - tfds_params=FLAGS.tfds_params, process_text_fn=processor_text_fn) - return classifier_data_lib.generate_tf_record_from_data_file( - processor, - None, - tokenizer, - train_data_output_path=FLAGS.train_data_output_path, - eval_data_output_path=FLAGS.eval_data_output_path, - test_data_output_path=FLAGS.test_data_output_path, - max_seq_length=FLAGS.max_seq_length) - else: - processors = { - "ax": - classifier_data_lib.AxProcessor, - "cola": - classifier_data_lib.ColaProcessor, - "imdb": - classifier_data_lib.ImdbProcessor, - "mnli": - functools.partial( - classifier_data_lib.MnliProcessor, mnli_type=FLAGS.mnli_type), - "mrpc": - classifier_data_lib.MrpcProcessor, - "qnli": - classifier_data_lib.QnliProcessor, - "qqp": - classifier_data_lib.QqpProcessor, - "rte": - classifier_data_lib.RteProcessor, - "sst-2": - classifier_data_lib.SstProcessor, - "sts-b": - classifier_data_lib.StsBProcessor, - "xnli": - functools.partial( - classifier_data_lib.XnliProcessor, - language=FLAGS.xnli_language), - "paws-x": - functools.partial( - classifier_data_lib.PawsxProcessor, - language=FLAGS.pawsx_language), - "wnli": - classifier_data_lib.WnliProcessor, - "xtreme-xnli": - functools.partial( - classifier_data_lib.XtremeXnliProcessor, - translated_data_dir=FLAGS.translated_input_data_dir, - only_use_en_dev=FLAGS.only_use_en_dev), - "xtreme-paws-x": - functools.partial( - classifier_data_lib.XtremePawsxProcessor, - translated_data_dir=FLAGS.translated_input_data_dir, - only_use_en_dev=FLAGS.only_use_en_dev), - "ax-g": - classifier_data_lib.AXgProcessor, - "superglue-rte": - classifier_data_lib.SuperGLUERTEProcessor - } - task_name = FLAGS.classification_task_name.lower() - if task_name not in processors: - raise ValueError("Task not found: %s" % (task_name)) - - processor = processors[task_name](process_text_fn=processor_text_fn) - return classifier_data_lib.generate_tf_record_from_data_file( - processor, - FLAGS.input_data_dir, - tokenizer, - train_data_output_path=FLAGS.train_data_output_path, - eval_data_output_path=FLAGS.eval_data_output_path, - test_data_output_path=FLAGS.test_data_output_path, - max_seq_length=FLAGS.max_seq_length) - - -def generate_regression_dataset(): - """Generates regression dataset and returns input meta data.""" - if FLAGS.tokenization == "WordPiece": - tokenizer = tokenization.FullTokenizer( - vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) - processor_text_fn = tokenization.convert_to_unicode - else: - assert FLAGS.tokenization == "SentencePiece" - tokenizer = 
tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file) - processor_text_fn = functools.partial( - tokenization.preprocess_text, lower=FLAGS.do_lower_case) - - if FLAGS.tfds_params: - processor = classifier_data_lib.TfdsProcessor( - tfds_params=FLAGS.tfds_params, process_text_fn=processor_text_fn) - return classifier_data_lib.generate_tf_record_from_data_file( - processor, - None, - tokenizer, - train_data_output_path=FLAGS.train_data_output_path, - eval_data_output_path=FLAGS.eval_data_output_path, - test_data_output_path=FLAGS.test_data_output_path, - max_seq_length=FLAGS.max_seq_length) - else: - raise ValueError("No data processor found for the given regression task.") - - -def generate_squad_dataset(): - """Generates squad training dataset and returns input meta data.""" - assert FLAGS.squad_data_file - if FLAGS.tokenization == "WordPiece": - return squad_lib_wp.generate_tf_record_from_json_file( - input_file_path=FLAGS.squad_data_file, - vocab_file_path=FLAGS.vocab_file, - output_path=FLAGS.train_data_output_path, - translated_input_folder=FLAGS.translated_squad_data_folder, - max_seq_length=FLAGS.max_seq_length, - do_lower_case=FLAGS.do_lower_case, - max_query_length=FLAGS.max_query_length, - doc_stride=FLAGS.doc_stride, - version_2_with_negative=FLAGS.version_2_with_negative, - xlnet_format=FLAGS.xlnet_format) - else: - assert FLAGS.tokenization == "SentencePiece" - return squad_lib_sp.generate_tf_record_from_json_file( - input_file_path=FLAGS.squad_data_file, - sp_model_file=FLAGS.sp_model_file, - output_path=FLAGS.train_data_output_path, - translated_input_folder=FLAGS.translated_squad_data_folder, - max_seq_length=FLAGS.max_seq_length, - do_lower_case=FLAGS.do_lower_case, - max_query_length=FLAGS.max_query_length, - doc_stride=FLAGS.doc_stride, - xlnet_format=FLAGS.xlnet_format, - version_2_with_negative=FLAGS.version_2_with_negative) - - -def generate_retrieval_dataset(): - """Generate retrieval test and dev dataset and returns input meta data.""" - assert (FLAGS.input_data_dir and FLAGS.retrieval_task_name) - if FLAGS.tokenization == "WordPiece": - tokenizer = tokenization.FullTokenizer( - vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) - processor_text_fn = tokenization.convert_to_unicode - else: - assert FLAGS.tokenization == "SentencePiece" - tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file) - processor_text_fn = functools.partial( - tokenization.preprocess_text, lower=FLAGS.do_lower_case) - - processors = { - "bucc": sentence_retrieval_lib.BuccProcessor, - "tatoeba": sentence_retrieval_lib.TatoebaProcessor, - } - - task_name = FLAGS.retrieval_task_name.lower() - if task_name not in processors: - raise ValueError("Task not found: %s" % task_name) - - processor = processors[task_name](process_text_fn=processor_text_fn) - - return sentence_retrieval_lib.generate_sentence_retrevial_tf_record( - processor, FLAGS.input_data_dir, tokenizer, FLAGS.eval_data_output_path, - FLAGS.test_data_output_path, FLAGS.max_seq_length) - - -def generate_tagging_dataset(): - """Generates tagging dataset.""" - processors = { - "panx": - functools.partial( - tagging_data_lib.PanxProcessor, - only_use_en_train=FLAGS.tagging_only_use_en_train, - only_use_en_dev=FLAGS.only_use_en_dev), - "udpos": - functools.partial( - tagging_data_lib.UdposProcessor, - only_use_en_train=FLAGS.tagging_only_use_en_train, - only_use_en_dev=FLAGS.only_use_en_dev), - } - task_name = FLAGS.tagging_task_name.lower() - if task_name not in processors: - raise ValueError("Task not 
found: %s" % task_name) - - if FLAGS.tokenization == "WordPiece": - tokenizer = tokenization.FullTokenizer( - vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) - processor_text_fn = tokenization.convert_to_unicode - elif FLAGS.tokenization == "SentencePiece": - tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file) - processor_text_fn = functools.partial( - tokenization.preprocess_text, lower=FLAGS.do_lower_case) - else: - raise ValueError("Unsupported tokenization: %s" % FLAGS.tokenization) - - processor = processors[task_name]() - return tagging_data_lib.generate_tf_record_from_data_file( - processor, FLAGS.input_data_dir, tokenizer, FLAGS.max_seq_length, - FLAGS.train_data_output_path, FLAGS.eval_data_output_path, - FLAGS.test_data_output_path, processor_text_fn) - - -def main(_): - if FLAGS.tokenization == "WordPiece": - if not FLAGS.vocab_file: - raise ValueError( - "FLAG vocab_file for word-piece tokenizer is not specified.") - else: - assert FLAGS.tokenization == "SentencePiece" - if not FLAGS.sp_model_file: - raise ValueError( - "FLAG sp_model_file for sentence-piece tokenizer is not specified.") - - if FLAGS.fine_tuning_task_type != "retrieval": - flags.mark_flag_as_required("train_data_output_path") - - if FLAGS.fine_tuning_task_type == "classification": - input_meta_data = generate_classifier_dataset() - elif FLAGS.fine_tuning_task_type == "regression": - input_meta_data = generate_regression_dataset() - elif FLAGS.fine_tuning_task_type == "retrieval": - input_meta_data = generate_retrieval_dataset() - elif FLAGS.fine_tuning_task_type == "squad": - input_meta_data = generate_squad_dataset() - else: - assert FLAGS.fine_tuning_task_type == "tagging" - input_meta_data = generate_tagging_dataset() - - tf.io.gfile.makedirs(os.path.dirname(FLAGS.meta_data_file_path)) - with tf.io.gfile.GFile(FLAGS.meta_data_file_path, "w") as writer: - writer.write(json.dumps(input_meta_data, indent=4) + "\n") - - -if __name__ == "__main__": - flags.mark_flag_as_required("meta_data_file_path") - app.run(main) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/create_pretraining_data.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/create_pretraining_data.py deleted file mode 100644 index e7086bdcb9236ee25a420083beb5d9ec45dbcc88..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/create_pretraining_data.py +++ /dev/null @@ -1,685 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Create masked LM/next sentence masked_lm TF examples for BERT.""" - -import collections -import itertools -import random - -# Import libraries -from absl import app -from absl import flags -from absl import logging -import tensorflow as tf - -from official.nlp.bert import tokenization - -FLAGS = flags.FLAGS - -flags.DEFINE_string("input_file", None, - "Input raw text file (or comma-separated list of files).") - -flags.DEFINE_string( - "output_file", None, - "Output TF example file (or comma-separated list of files).") - -flags.DEFINE_string("vocab_file", None, - "The vocabulary file that the BERT model was trained on.") - -flags.DEFINE_bool( - "do_lower_case", True, - "Whether to lower case the input text. Should be True for uncased " - "models and False for cased models.") - -flags.DEFINE_bool( - "do_whole_word_mask", False, - "Whether to use whole word masking rather than per-WordPiece masking.") - -flags.DEFINE_integer( - "max_ngram_size", None, - "Mask contiguous whole words (n-grams) of up to `max_ngram_size` using a " - "weighting scheme to favor shorter n-grams. " - "Note: `--do_whole_word_mask=True` must also be set when n-gram masking.") - -flags.DEFINE_bool( - "gzip_compress", False, - "Whether to use `GZIP` compress option to get compressed TFRecord files.") - -flags.DEFINE_bool( - "use_v2_feature_names", False, - "Whether to use the feature names consistent with the models.") - -flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") - -flags.DEFINE_integer("max_predictions_per_seq", 20, - "Maximum number of masked LM predictions per sequence.") - -flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") - -flags.DEFINE_integer( - "dupe_factor", 10, - "Number of times to duplicate the input data (with different masks).") - -flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") - -flags.DEFINE_float( - "short_seq_prob", 0.1, - "Probability of creating sequences which are shorter than the " - "maximum length.") - - -class TrainingInstance(object): - """A single training instance (sentence pair).""" - - def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, - is_random_next): - self.tokens = tokens - self.segment_ids = segment_ids - self.is_random_next = is_random_next - self.masked_lm_positions = masked_lm_positions - self.masked_lm_labels = masked_lm_labels - - def __str__(self): - s = "" - s += "tokens: %s\n" % (" ".join( - [tokenization.printable_text(x) for x in self.tokens])) - s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) - s += "is_random_next: %s\n" % self.is_random_next - s += "masked_lm_positions: %s\n" % (" ".join( - [str(x) for x in self.masked_lm_positions])) - s += "masked_lm_labels: %s\n" % (" ".join( - [tokenization.printable_text(x) for x in self.masked_lm_labels])) - s += "\n" - return s - - def __repr__(self): - return self.__str__() - - -def write_instance_to_example_files(instances, tokenizer, max_seq_length, - max_predictions_per_seq, output_files, - gzip_compress, use_v2_feature_names): - """Creates TF example files from 
`TrainingInstance`s.""" - writers = [] - for output_file in output_files: - writers.append( - tf.io.TFRecordWriter( - output_file, options="GZIP" if gzip_compress else "")) - - writer_index = 0 - - total_written = 0 - for (inst_index, instance) in enumerate(instances): - input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) - input_mask = [1] * len(input_ids) - segment_ids = list(instance.segment_ids) - assert len(input_ids) <= max_seq_length - - while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(0) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - masked_lm_positions = list(instance.masked_lm_positions) - masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) - masked_lm_weights = [1.0] * len(masked_lm_ids) - - while len(masked_lm_positions) < max_predictions_per_seq: - masked_lm_positions.append(0) - masked_lm_ids.append(0) - masked_lm_weights.append(0.0) - - next_sentence_label = 1 if instance.is_random_next else 0 - - features = collections.OrderedDict() - if use_v2_feature_names: - features["input_word_ids"] = create_int_feature(input_ids) - features["input_type_ids"] = create_int_feature(segment_ids) - else: - features["input_ids"] = create_int_feature(input_ids) - features["segment_ids"] = create_int_feature(segment_ids) - - features["input_mask"] = create_int_feature(input_mask) - features["masked_lm_positions"] = create_int_feature(masked_lm_positions) - features["masked_lm_ids"] = create_int_feature(masked_lm_ids) - features["masked_lm_weights"] = create_float_feature(masked_lm_weights) - features["next_sentence_labels"] = create_int_feature([next_sentence_label]) - - tf_example = tf.train.Example(features=tf.train.Features(feature=features)) - - writers[writer_index].write(tf_example.SerializeToString()) - writer_index = (writer_index + 1) % len(writers) - - total_written += 1 - - if inst_index < 20: - logging.info("*** Example ***") - logging.info("tokens: %s", " ".join( - [tokenization.printable_text(x) for x in instance.tokens])) - - for feature_name in features.keys(): - feature = features[feature_name] - values = [] - if feature.int64_list.value: - values = feature.int64_list.value - elif feature.float_list.value: - values = feature.float_list.value - logging.info("%s: %s", feature_name, " ".join([str(x) for x in values])) - - for writer in writers: - writer.close() - - logging.info("Wrote %d total instances", total_written) - - -def create_int_feature(values): - feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) - return feature - - -def create_float_feature(values): - feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) - return feature - - -def create_training_instances(input_files, - tokenizer, - max_seq_length, - dupe_factor, - short_seq_prob, - masked_lm_prob, - max_predictions_per_seq, - rng, - do_whole_word_mask=False, - max_ngram_size=None): - """Create `TrainingInstance`s from raw text.""" - all_documents = [[]] - - # Input file format: - # (1) One sentence per line. These should ideally be actual sentences, not - # entire paragraphs or arbitrary spans of text. (Because we use the - # sentence boundaries for the "next sentence prediction" task). - # (2) Blank lines between documents. Document boundaries are needed so - # that the "next sentence prediction" task doesn't span between documents. 
- for input_file in input_files: - with tf.io.gfile.GFile(input_file, "rb") as reader: - while True: - line = tokenization.convert_to_unicode(reader.readline()) - if not line: - break - line = line.strip() - - # Empty lines are used as document delimiters - if not line: - all_documents.append([]) - tokens = tokenizer.tokenize(line) - if tokens: - all_documents[-1].append(tokens) - - # Remove empty documents - all_documents = [x for x in all_documents if x] - rng.shuffle(all_documents) - - vocab_words = list(tokenizer.vocab.keys()) - instances = [] - for _ in range(dupe_factor): - for document_index in range(len(all_documents)): - instances.extend( - create_instances_from_document( - all_documents, document_index, max_seq_length, short_seq_prob, - masked_lm_prob, max_predictions_per_seq, vocab_words, rng, - do_whole_word_mask, max_ngram_size)) - - rng.shuffle(instances) - return instances - - -def create_instances_from_document( - all_documents, document_index, max_seq_length, short_seq_prob, - masked_lm_prob, max_predictions_per_seq, vocab_words, rng, - do_whole_word_mask=False, - max_ngram_size=None): - """Creates `TrainingInstance`s for a single document.""" - document = all_documents[document_index] - - # Account for [CLS], [SEP], [SEP] - max_num_tokens = max_seq_length - 3 - - # We *usually* want to fill up the entire sequence since we are padding - # to `max_seq_length` anyways, so short sequences are generally wasted - # computation. However, we *sometimes* - # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter - # sequences to minimize the mismatch between pre-training and fine-tuning. - # The `target_seq_length` is just a rough target however, whereas - # `max_seq_length` is a hard limit. - target_seq_length = max_num_tokens - if rng.random() < short_seq_prob: - target_seq_length = rng.randint(2, max_num_tokens) - - # We DON'T just concatenate all of the tokens from a document into a long - # sequence and choose an arbitrary split point because this would make the - # next sentence prediction task too easy. Instead, we split the input into - # segments "A" and "B" based on the actual "sentences" provided by the user - # input. - instances = [] - current_chunk = [] - current_length = 0 - i = 0 - while i < len(document): - segment = document[i] - current_chunk.append(segment) - current_length += len(segment) - if i == len(document) - 1 or current_length >= target_seq_length: - if current_chunk: - # `a_end` is how many segments from `current_chunk` go into the `A` - # (first) sentence. - a_end = 1 - if len(current_chunk) >= 2: - a_end = rng.randint(1, len(current_chunk) - 1) - - tokens_a = [] - for j in range(a_end): - tokens_a.extend(current_chunk[j]) - - tokens_b = [] - # Random next - is_random_next = False - if len(current_chunk) == 1 or rng.random() < 0.5: - is_random_next = True - target_b_length = target_seq_length - len(tokens_a) - - # This should rarely go for more than one iteration for large - # corpora. However, just to be careful, we try to make sure that - # the random document is not the same as the document - # we're processing. 
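  # Note that if the corpus holds only a single document, every draw below
  # returns `document_index` itself, the ten retries are exhausted, and the
  # "random" segment ends up being sampled from the current document anyway.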
- for _ in range(10): - random_document_index = rng.randint(0, len(all_documents) - 1) - if random_document_index != document_index: - break - - random_document = all_documents[random_document_index] - random_start = rng.randint(0, len(random_document) - 1) - for j in range(random_start, len(random_document)): - tokens_b.extend(random_document[j]) - if len(tokens_b) >= target_b_length: - break - # We didn't actually use these segments so we "put them back" so - # they don't go to waste. - num_unused_segments = len(current_chunk) - a_end - i -= num_unused_segments - # Actual next - else: - is_random_next = False - for j in range(a_end, len(current_chunk)): - tokens_b.extend(current_chunk[j]) - truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) - - assert len(tokens_a) >= 1 - assert len(tokens_b) >= 1 - - tokens = [] - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) - for token in tokens_a: - tokens.append(token) - segment_ids.append(0) - - tokens.append("[SEP]") - segment_ids.append(0) - - for token in tokens_b: - tokens.append(token) - segment_ids.append(1) - tokens.append("[SEP]") - segment_ids.append(1) - - (tokens, masked_lm_positions, - masked_lm_labels) = create_masked_lm_predictions( - tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng, - do_whole_word_mask, max_ngram_size) - instance = TrainingInstance( - tokens=tokens, - segment_ids=segment_ids, - is_random_next=is_random_next, - masked_lm_positions=masked_lm_positions, - masked_lm_labels=masked_lm_labels) - instances.append(instance) - current_chunk = [] - current_length = 0 - i += 1 - - return instances - - -MaskedLmInstance = collections.namedtuple("MaskedLmInstance", - ["index", "label"]) - -# A _Gram is a [half-open) interval of token indices which form a word. -# E.g., -# words: ["The", "doghouse"] -# tokens: ["The", "dog", "##house"] -# grams: [(0,1), (1,3)] -_Gram = collections.namedtuple("_Gram", ["begin", "end"]) - - -def _window(iterable, size): - """Helper to create a sliding window iterator with a given size. - - E.g., - input = [1, 2, 3, 4] - _window(input, 1) => [1], [2], [3], [4] - _window(input, 2) => [1, 2], [2, 3], [3, 4] - _window(input, 3) => [1, 2, 3], [2, 3, 4] - _window(input, 4) => [1, 2, 3, 4] - _window(input, 5) => None - - Args: - iterable: elements to iterate over. - size: size of the window. - - Yields: - Elements of `iterable` batched into a sliding window of length `size`. - """ - i = iter(iterable) - window = [] - try: - for e in range(0, size): - window.append(next(i)) - yield window - except StopIteration: - # handle the case where iterable's length is less than the window size. - return - for e in i: - window = window[1:] + [e] - yield window - - -def _contiguous(sorted_grams): - """Test whether a sequence of grams is contiguous. - - Args: - sorted_grams: _Grams which are sorted in increasing order. - Returns: - True if `sorted_grams` are touching each other. - - E.g., - _contiguous([(1, 4), (4, 5), (5, 10)]) == True - _contiguous([(1, 2), (4, 5)]) == False - """ - for a, b in _window(sorted_grams, 2): - if a.end != b.begin: - return False - return True - - -def _masking_ngrams(grams, max_ngram_size, max_masked_tokens, rng): - """Create a list of masking {1, ..., n}-grams from a list of one-grams. - - This is an extension of 'whole word masking' to mask multiple, contiguous - words (e.g., "the red boat"). 
- - Each input gram represents the token indices of a single word, - words: ["the", "red", "boat"] - tokens: ["the", "red", "boa", "##t"] - grams: [(0,1), (1,2), (2,4)] - - For a `max_ngram_size` of three, possible output masks include: - 1-grams: (0,1), (1,2), (2,4) - 2-grams: (0,2), (1,4) - 3-grams: (0,4) - - Output masks will not overlap and contain at most `max_masked_tokens` total - tokens. E.g., for the example above with `max_masked_tokens` as three, - valid outputs are: - [(0,1), (1,2)] # "the", "red" covering two tokens - [(1,2), (2,4)] # "red", "boa", "##t" covering three tokens - - The length of the selected n-gram follows a zipf weighting to - favor shorter n-gram sizes (weight(1)=1, weight(2)=1/2, weight(3)=1/3, ...). - - Args: - grams: List of one-grams. - max_ngram_size: Maximum number of contiguous one-grams combined to create - an n-gram. - max_masked_tokens: Maximum total number of tokens to be masked. - rng: `random.Random` generator. - - Returns: - A list of n-grams to be used as masks. - """ - if not grams: - return None - - grams = sorted(grams) - num_tokens = grams[-1].end - - # Ensure our grams are valid (i.e., they don't overlap). - for a, b in _window(grams, 2): - if a.end > b.begin: - raise ValueError("overlapping grams: {}".format(grams)) - - # Build map from n-gram length to list of n-grams. - ngrams = {i: [] for i in range(1, max_ngram_size+1)} - for gram_size in range(1, max_ngram_size+1): - for g in _window(grams, gram_size): - if _contiguous(g): - # Add an n-gram which spans these one-grams. - ngrams[gram_size].append(_Gram(g[0].begin, g[-1].end)) - - # Shuffle each list of n-grams. - for v in ngrams.values(): - rng.shuffle(v) - - # Create the weighting for n-gram length selection. - # Stored cumulatively for `random.choices` below. - cumulative_weights = list( - itertools.accumulate([1./n for n in range(1, max_ngram_size+1)])) - - output_ngrams = [] - # Keep a bitmask of which tokens have been masked. - masked_tokens = [False] * num_tokens - # Loop until we have enough masked tokens or there are no more candidate - # n-grams of any length. - # Each code path should ensure one or more elements from `ngrams` are removed - # to guarantee this loop terminates. - while (sum(masked_tokens) < max_masked_tokens and - sum(len(s) for s in ngrams.values())): - # Pick an n-gram size based on our weights. - sz = random.choices(range(1, max_ngram_size+1), - cum_weights=cumulative_weights)[0] - - # Ensure this size doesn't result in too many masked tokens. - # E.g., a two-gram contains _at least_ two tokens. - if sum(masked_tokens) + sz > max_masked_tokens: - # All n-grams of this length are too long and can be removed from - # consideration. - ngrams[sz].clear() - continue - - # All of the n-grams of this size have been used. - if not ngrams[sz]: - continue - - # Choose a random n-gram of the given size. - gram = ngrams[sz].pop() - num_gram_tokens = gram.end-gram.begin - - # Check if this would add too many tokens. - if num_gram_tokens + sum(masked_tokens) > max_masked_tokens: - continue - - # Check if any of the tokens in this gram have already been masked. - if sum(masked_tokens[gram.begin:gram.end]): - continue - - # Found a usable n-gram! Mark its tokens as masked and add it to the output. - masked_tokens[gram.begin:gram.end] = [True] * (gram.end-gram.begin) - output_ngrams.append(gram) - return output_ngrams - - -def _wordpieces_to_grams(tokens): - """Reconstitute grams (words) from `tokens`. 
- - E.g., - tokens: ['[CLS]', 'That', 'lit', '##tle', 'blue', 'tru', '##ck', '[SEP]'] - grams: [ [1,2), [2, 4), [4,5) , [5, 6)] - - Args: - tokens: list of wordpieces - Returns: - List of _Grams representing spans of whole words - (without "[CLS]" and "[SEP]"). - """ - grams = [] - gram_start_pos = None - for i, token in enumerate(tokens): - if gram_start_pos is not None and token.startswith("##"): - continue - if gram_start_pos is not None: - grams.append(_Gram(gram_start_pos, i)) - if token not in ["[CLS]", "[SEP]"]: - gram_start_pos = i - else: - gram_start_pos = None - if gram_start_pos is not None: - grams.append(_Gram(gram_start_pos, len(tokens))) - return grams - - -def create_masked_lm_predictions(tokens, masked_lm_prob, - max_predictions_per_seq, vocab_words, rng, - do_whole_word_mask, - max_ngram_size=None): - """Creates the predictions for the masked LM objective.""" - if do_whole_word_mask: - grams = _wordpieces_to_grams(tokens) - else: - # Here we consider each token to be a word to allow for sub-word masking. - if max_ngram_size: - raise ValueError("cannot use ngram masking without whole word masking") - grams = [_Gram(i, i+1) for i in range(0, len(tokens)) - if tokens[i] not in ["[CLS]", "[SEP]"]] - - num_to_predict = min(max_predictions_per_seq, - max(1, int(round(len(tokens) * masked_lm_prob)))) - # Generate masks. If `max_ngram_size` in [0, None] it means we're doing - # whole word masking or token level masking. Both of these can be treated - # as the `max_ngram_size=1` case. - masked_grams = _masking_ngrams(grams, max_ngram_size or 1, - num_to_predict, rng) - masked_lms = [] - output_tokens = list(tokens) - for gram in masked_grams: - # 80% of the time, replace all n-gram tokens with [MASK] - if rng.random() < 0.8: - replacement_action = lambda idx: "[MASK]" - else: - # 10% of the time, keep all the original n-gram tokens. - if rng.random() < 0.5: - replacement_action = lambda idx: tokens[idx] - # 10% of the time, replace each n-gram token with a random word. - else: - replacement_action = lambda idx: rng.choice(vocab_words) - - for idx in range(gram.begin, gram.end): - output_tokens[idx] = replacement_action(idx) - masked_lms.append(MaskedLmInstance(index=idx, label=tokens[idx])) - - assert len(masked_lms) <= num_to_predict - masked_lms = sorted(masked_lms, key=lambda x: x.index) - - masked_lm_positions = [] - masked_lm_labels = [] - for p in masked_lms: - masked_lm_positions.append(p.index) - masked_lm_labels.append(p.label) - - return (output_tokens, masked_lm_positions, masked_lm_labels) - - -def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): - """Truncates a pair of sequences to a maximum sequence length.""" - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_num_tokens: - break - - trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b - assert len(trunc_tokens) >= 1 - - # We want to sometimes truncate from the front and sometimes from the - # back to add more randomness and avoid biases. 
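  # Worked example (illustrative values): with max_num_tokens=4,
  # tokens_a=["a", "b", "c"] and tokens_b=["d", "e"], the combined length is 5,
  # so one token is dropped from tokens_a (the longer side), from the front or
  # the back with equal probability, leaving exactly 4 tokens in total.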
- if rng.random() < 0.5: - del trunc_tokens[0] - else: - trunc_tokens.pop() - - -def main(_): - tokenizer = tokenization.FullTokenizer( - vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) - - input_files = [] - for input_pattern in FLAGS.input_file.split(","): - input_files.extend(tf.io.gfile.glob(input_pattern)) - - logging.info("*** Reading from input files ***") - for input_file in input_files: - logging.info(" %s", input_file) - - rng = random.Random(FLAGS.random_seed) - instances = create_training_instances( - input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, - FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, - rng, FLAGS.do_whole_word_mask, FLAGS.max_ngram_size) - - output_files = FLAGS.output_file.split(",") - logging.info("*** Writing to output files ***") - for output_file in output_files: - logging.info(" %s", output_file) - - write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, - FLAGS.max_predictions_per_seq, output_files, - FLAGS.gzip_compress, - FLAGS.use_v2_feature_names) - - -if __name__ == "__main__": - flags.mark_flag_as_required("input_file") - flags.mark_flag_as_required("output_file") - flags.mark_flag_as_required("vocab_file") - app.run(main) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/create_pretraining_data_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/create_pretraining_data_test.py deleted file mode 100644 index fe7093064a4914e832c0057d7c3596c24090a444..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/create_pretraining_data_test.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Tests for official.nlp.data.create_pretraining_data.""" -import random - -import tensorflow as tf - -from official.nlp.data import create_pretraining_data as cpd - -_VOCAB_WORDS = ["vocab_1", "vocab_2"] - - -class CreatePretrainingDataTest(tf.test.TestCase): - - def assertTokens(self, input_tokens, output_tokens, masked_positions, - masked_labels): - # Ensure the masked positions are unique. 
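    # `assertCountEqual` compares elements regardless of order, so checking
    # the list against its own de-duplicated set fails exactly when some
    # position appears more than once.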
- self.assertCountEqual(masked_positions, set(masked_positions)) - - # Ensure we can reconstruct the input from the output. - reconstructed_tokens = output_tokens - for pos, label in zip(masked_positions, masked_labels): - reconstructed_tokens[pos] = label - self.assertEqual(input_tokens, reconstructed_tokens) - - # Ensure each label is valid. - for pos, label in zip(masked_positions, masked_labels): - output_token = output_tokens[pos] - if (output_token == "[MASK]" or output_token in _VOCAB_WORDS or - output_token == input_tokens[pos]): - continue - self.fail("invalid mask value: {}".format(output_token)) - - def test_wordpieces_to_grams(self): - tests = [ - (["That", "cone"], [(0, 1), (1, 2)]), - (["That", "cone", "##s"], [(0, 1), (1, 3)]), - (["Swit", "##zer", "##land"], [(0, 3)]), - (["[CLS]", "Up", "##dog"], [(1, 3)]), - (["[CLS]", "Up", "##dog", "[SEP]", "Down"], [(1, 3), (4, 5)]), - ] - for inp, expected in tests: - output = cpd._wordpieces_to_grams(inp) - self.assertEqual(expected, output) - - def test_window(self): - input_list = [1, 2, 3, 4] - window_outputs = [ - (1, [[1], [2], [3], [4]]), - (2, [[1, 2], [2, 3], [3, 4]]), - (3, [[1, 2, 3], [2, 3, 4]]), - (4, [[1, 2, 3, 4]]), - (5, []), - ] - for window, expected in window_outputs: - output = cpd._window(input_list, window) - self.assertEqual(expected, list(output)) - - def test_create_masked_lm_predictions(self): - tokens = ["[CLS]", "a", "##a", "b", "##b", "c", "##c", "[SEP]"] - rng = random.Random(123) - for _ in range(0, 5): - output_tokens, masked_positions, masked_labels = ( - cpd.create_masked_lm_predictions( - tokens=tokens, - masked_lm_prob=1.0, - max_predictions_per_seq=3, - vocab_words=_VOCAB_WORDS, - rng=rng, - do_whole_word_mask=False, - max_ngram_size=None)) - self.assertEqual(len(masked_positions), 3) - self.assertEqual(len(masked_labels), 3) - self.assertTokens(tokens, output_tokens, masked_positions, masked_labels) - - def test_create_masked_lm_predictions_whole_word(self): - tokens = ["[CLS]", "a", "##a", "b", "##b", "c", "##c", "[SEP]"] - rng = random.Random(345) - for _ in range(0, 5): - output_tokens, masked_positions, masked_labels = ( - cpd.create_masked_lm_predictions( - tokens=tokens, - masked_lm_prob=1.0, - max_predictions_per_seq=3, - vocab_words=_VOCAB_WORDS, - rng=rng, - do_whole_word_mask=True, - max_ngram_size=None)) - # since we can't get exactly three tokens without breaking a word we - # only take two. - self.assertEqual(len(masked_positions), 2) - self.assertEqual(len(masked_labels), 2) - self.assertTokens(tokens, output_tokens, masked_positions, masked_labels) - # ensure that we took an entire word. 
- self.assertIn(masked_labels, [["a", "##a"], ["b", "##b"], ["c", "##c"]]) - - def test_create_masked_lm_predictions_ngram(self): - tokens = ["[CLS]"] + ["tok{}".format(i) for i in range(0, 512)] + ["[SEP]"] - rng = random.Random(345) - for _ in range(0, 5): - output_tokens, masked_positions, masked_labels = ( - cpd.create_masked_lm_predictions( - tokens=tokens, - masked_lm_prob=1.0, - max_predictions_per_seq=76, - vocab_words=_VOCAB_WORDS, - rng=rng, - do_whole_word_mask=True, - max_ngram_size=3)) - self.assertEqual(len(masked_positions), 76) - self.assertEqual(len(masked_labels), 76) - self.assertTokens(tokens, output_tokens, masked_positions, masked_labels) - - -if __name__ == "__main__": - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/create_xlnet_pretraining_data.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/create_xlnet_pretraining_data.py deleted file mode 100644 index 3afbffaae4656a86e8b6f609840c9bd849e1dc46..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/create_xlnet_pretraining_data.py +++ /dev/null @@ -1,737 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Create LM TF examples for XLNet.""" - -import json -import math -import os - -import random -from typing import Iterable, Mapping, List, Optional, Tuple -import unicodedata - -# Import libraries - -from absl import app -from absl import flags -from absl import logging - -import dataclasses -import numpy as np -import tensorflow as tf - -from official.nlp.bert import tokenization - -special_symbols = { - "<unk>": 0, - "<s>": 1, - "</s>": 2, - "<cls>": 3, - "<sep>": 4, - "<pad>": 5, - "<mask>": 6, - "<eod>": 7, - "<eop>": 8, -} - -FLAGS = flags.FLAGS - -flags.DEFINE_integer("seq_length", 512, - help="Sequence length.") -flags.DEFINE_integer("reuse_length", 256, - help="Number of tokens that can be reused as memory. 
" - "Could be half of `seq_len`.") -flags.DEFINE_string("input_file", None, - "Input raw text file (or comma-separated list of files).") -flags.DEFINE_string( - "save_dir", None, - "Directory for saving processed data.") -flags.DEFINE_string("sp_model_file", "", - "The path to the model used by sentence piece tokenizer.") -flags.DEFINE_bool("use_eod_token", True, - "Whether or not to include EOD tokens.") -flags.DEFINE_bool("bi_data", True, "Whether or not to use bi-directional data.") -flags.DEFINE_bool( - "do_lower_case", True, - "Whether to lower case the input text. Should be True for uncased " - "models and False for cased models.") -flags.DEFINE_integer("per_host_batch_size", 32, "Batch size per host.") -flags.DEFINE_integer("num_cores_per_host", 16, - "The number of (TPU) cores per host.") -flags.DEFINE_string("prefix", "", "Filename prefix.") -flags.DEFINE_string("suffix", "", "Filename suffix.") - -flags.DEFINE_integer("task_id", None, - "The id of the current task.") -flags.DEFINE_integer("num_tasks", None, - "The total number of tasks.") -flags.DEFINE_integer("num_passes", 1, "The number of times to run the script.") - - -@dataclasses.dataclass -class TrainingInstance: - """Representation of a single XLNet Pretraining instance.""" - data: Iterable[int] - segment_ids: Iterable[int] - boundary_indices: Iterable[int] - label: int - - def to_feature(self) -> Mapping[str, tf.train.Feature]: - feat = lambda x: tf.train.Feature(int64_list=tf.train.Int64List(value=x)) - return dict( - input_word_ids=feat(self.data), - input_type_ids=feat(self.segment_ids), - boundary_indices=feat(self.boundary_indices), - label=feat([self.label])) - - def to_example(self) -> tf.train.Example: - return tf.train.Example( - features=tf.train.Features(feature=self.to_feature())) - - def __str__(self): - def seq_to_str(seq): - return " ".join([str(x) for x in seq]) - - s = "" - s += "tokens: %s\n" % seq_to_str(self.data) - s += "segment_ids: %s\n" % seq_to_str(self.segment_ids) - s += "boundary_indices: %s\n" % seq_to_str(self.boundary_indices) - s += "label: %s\n" % self.label - s += "\n" - return s - - def __repr__(self): - return self.__str__() - - -def _preprocess_line(line: str, do_lower_case: bool = False) -> str: - """Preprocesses an individual raw text line. - - This function will: - - Remove extraneous spaces. - - Replace `` with ", and '' with ". - - Replaces accents. - - Applies lower casing. - - Args: - line: The input line to preprocess. - do_lower_case: Whether or not to lower case the text. - - Returns: - The preprocessed line. - - """ - line = " ".join(line.split()) - line = line.replace("``", "\"").replace("''", "\"") - - # Replace accents. - line = unicodedata.normalize("NFKD", line) - line = "".join([c for c in line if not unicodedata.combining(c)]) - - if do_lower_case: - line = line.lower() - return line - - -def preprocess_and_tokenize_input_files( - input_files: Iterable[str], - tokenizer: tokenization.FullSentencePieceTokenizer, - use_eod: bool = True, - do_lower_case: bool = False, - log_example_freq: int = 100000) -> List[Tuple[np.array, np.array]]: - """Preprocesses and encodes raw text from input files. - - This function preprocesses raw text and encodes them into tokens using a - `SentencePieceModel` tokenization method. This also provides the sentence - indicator for each token. - - Args: - input_files: The list of input file names. - tokenizer: The SentencePiece tokenizer that has the attribute `sp_model`. - use_eod: Whether or not to use an EOD indicator. 
If `False`, then EOD is - not included. - do_lower_case: Whether or not to apply lower casing during raw text - preprocessing. - log_example_freq: The optional field for how many lines to process before - emitting an info log. - - Returns: - The preprocessed list. Each entry in the list is a tuple consisting of - the token IDs and the sentence IDs. - - """ - all_data = [] - eod_symbol = special_symbols["<eod>"] - - total_number_of_lines = 0 - - # Input file format: - # (1) One sentence per line. These should ideally be actual sentences, not - # entire paragraphs or arbitrary spans of text. (Because we use the - # sentence boundaries for the "next sentence prediction" task). - # (2) Blank lines between documents. Document boundaries are needed so - # that the "next sentence prediction" task doesn't span between documents. - for input_file in input_files: - line_count = 0 - logging.info("Preprocessing %s", input_file) - - all_tokens = [] - all_sentence_ids = [] - - sentence_id = True - - with tf.io.gfile.GFile(input_file, "rb") as reader: - while True: - line = tokenization.convert_to_unicode(reader.readline()) - if not line: - break - - line_count += 1 - if line_count % log_example_freq == 0: - logging.info("Loading line %d", line_count) - - line = line.strip() - - if not line: - if use_eod: - token_ids = [eod_symbol] - sentence_id = not sentence_id - else: - continue - else: - preprocessed_line = _preprocess_line( - line=line, do_lower_case=do_lower_case) - token_ids = tokenization.encode_ids( - sp_model=tokenizer.sp_model, text=preprocessed_line) - - all_tokens.extend(token_ids) - all_sentence_ids.extend([sentence_id] * len(token_ids)) - sentence_id = not sentence_id - logging.info("Finished processing %s. Number of lines: %d", - input_file, line_count) - if line_count == 0: - continue - total_number_of_lines += line_count - all_tokens = np.array(all_tokens, dtype=np.int64) - all_sentence_ids = np.array(all_sentence_ids, dtype=np.bool) - all_data.append((all_tokens, all_sentence_ids)) - - logging.info("Completed text preprocessing. Total number of lines: %d", - total_number_of_lines) - return all_data - - -def _reshape_to_batch_dimensions( - tokens: np.array, - sentence_ids: np.array, - per_host_batch_size: int) -> Tuple[np.array, np.array]: - """Truncates and reshapes input data with a batch major dimension. - - Args: - tokens: The input token ids. This should have the same shape as - `sentence_ids`. - sentence_ids: The input sentence ids. This should have the same shape as - `token_ids`. - per_host_batch_size: The target per-host batch size. - - Returns: - The tuple of reshaped tokens and sentence_ids. - """ - num_steps = len(tokens) // per_host_batch_size - truncated_data_length = num_steps * per_host_batch_size - - logging.info("per_host_batch_size: %d", per_host_batch_size) - logging.info("num_steps: %d", num_steps) - def truncate_and_reshape(a): - return a[:truncated_data_length].reshape((per_host_batch_size, num_steps)) - - return (truncate_and_reshape(tokens), truncate_and_reshape(sentence_ids)) - - -def _create_a_and_b_segments( - tokens: np.array, - sentence_ids: np.array, - begin_index: int, - total_length: int, - no_cut_probability: float = 0.5): - """Splits segments A and B from a single instance of tokens and sentence ids. - - Args: - tokens: The 1D input token ids. This represents an individual entry within a - batch. - sentence_ids: The 1D input sentence ids. This represents an individual entry - within a batch. This should be the same length as `tokens`. 
- begin_index: The reference beginning index to split data. - total_length: The target combined length of segments A and B. - no_cut_probability: The probability of not cutting a segment despite - a cut possibly existing. - - Returns: - A tuple consisting of A data, B data, and label. - - """ - data_length = tokens.shape[0] - if begin_index + total_length >= data_length: - logging.info("[_create_segments]: begin_index %d + total_length %d >= " - "data_length %d", begin_index, total_length, data_length) - return None - - end_index = begin_index + 1 - cut_indices = [] - - # Identify all indices where sentence IDs change from one to the next. - while end_index < data_length: - if sentence_ids[end_index] != sentence_ids[end_index - 1]: - if end_index - begin_index >= total_length: - break - cut_indices.append(end_index) - end_index += 1 - - a_begin = begin_index - - if not cut_indices or random.random() < no_cut_probability: - # Segments A and B are contained within the same sentence. - label = 0 - if not cut_indices: - a_end = end_index - else: - a_end = random.choice(cut_indices) - b_length = max(1, total_length - (a_end - a_begin)) - b_begin = random.randint(0, data_length - 1 - b_length) - b_end = b_begin + b_length - - while b_begin > 0 and sentence_ids[b_begin - 1] == sentence_ids[b_begin]: - b_begin -= 1 - while (b_end < data_length - 1 and - sentence_ids[b_end - 1] == sentence_ids[b_end]): - b_end += 1 - else: - # Segments A and B are different sentences. - label = 1 - a_end = random.choice(cut_indices) - b_begin = a_end - b_end = end_index - - while a_end - a_begin + b_end - b_begin > total_length: - if a_end - a_begin > b_end - b_begin: - # Delete only the right side for the LM objective. - a_end -= 1 - else: - b_end -= 1 - if a_end >= data_length or b_end >= data_length: - logging.info("[_create_segments]: a_end %d or b_end %d >= data_length %d", - a_end, b_end, data_length) - return None - - a_data = tokens[a_begin: a_end] - b_data = tokens[b_begin: b_end] - return a_data, b_data, label - - -def _is_functional_piece(piece: str) -> bool: - return piece != "<unk>" and piece.startswith("<") and piece.endswith(">") - - -def _is_start_piece(piece: str) -> bool: - special_pieces = set(list('!"#$%&\"()*+,-./:;?@[\\]^_`{|}~')) - if (piece.startswith("▁") or piece in special_pieces): - return True - else: - return False - - -def _get_boundary_indices( - data: np.array, - tokenizer: tokenization.FullSentencePieceTokenizer) -> np.array: - """Gets the boundary indices of whole words.""" - seq_length = len(data) - boundary_indices = [] - for index, piece in enumerate(tokenizer.convert_ids_to_tokens(data.tolist())): - if _is_start_piece(piece) and not _is_functional_piece(piece): - boundary_indices.append(index) - boundary_indices.append(seq_length) - return boundary_indices - - -def _convert_tokens_to_instances( - tokens: np.array, - sentence_ids: np.array, - per_host_batch_size: int, - seq_length: int, - reuse_length: int, - bi_data: bool, - tokenizer: tokenization.FullSentencePieceTokenizer, - num_cores_per_host: int = 0, - logging_frequency: int = 500) -> List[TrainingInstance]: - """Converts tokens and sentence IDs into individual training instances. - - The format of data in the XLNet pretraining task is very similar to the - BERT pretraining task. Two segments A and B are randomly sampled, and the - concatenation of A and B into a single sequence is used to perform - language modeling. 
- - To create an XLNet Pretraining instance from a single long sequence, S: - - Create a segment of length `reuse_length`. This first segment represents - past tokens. During modeling, this segment is used to cache obtained - content representations for the segment recurrence mechanism. - - Similar to BERT, create a segment of length `seq_length` - `reuse_length` - composed of A and B segments. - For XLNet, the order is "A", "SEP", "B", "SEP", "CLS". - - Args: - tokens: All tokens concatenated into a single list. - sentence_ids: All sentence IDs concatenated into a single list. - per_host_batch_size: The target batch size per host. - seq_length: The max sequence length. - reuse_length: The number of tokens to use from the previous segment. - bi_data: Whether or not to use bidirectional data. - tokenizer: The SentencePiece tokenizer that has the attribute `sp_model`. - num_cores_per_host: The number of cores per host. This is required if - `bi_data` = `True`. - logging_frequency: The frequency at which to log status updates. - - Returns: - A list of `TrainingInstance` objects. - """ - instances = [] - - per_core_batch_size = (per_host_batch_size // num_cores_per_host - if bi_data else None) - - if bi_data: - logging.info("Bi-directional data enabled.") - assert per_host_batch_size % (2 * num_cores_per_host) == 0 - forward_tokens, forward_sentence_ids = _reshape_to_batch_dimensions( - tokens=tokens, - sentence_ids=sentence_ids, - per_host_batch_size=per_host_batch_size // 2) - forward_data_shape = (num_cores_per_host, 1, per_core_batch_size // 2, -1) - - forward_tokens = forward_tokens.reshape(forward_data_shape) - forward_sentence_ids = forward_sentence_ids.reshape(forward_data_shape) - - backwards_tokens = forward_tokens[:, :, :, ::-1] - backwards_sentence_ids = forward_sentence_ids[:, :, :, ::-1] - - tokens = np.concatenate([forward_tokens, backwards_tokens], 1).reshape( - per_host_batch_size, -1) - sentence_ids = np.concatenate( - [forward_sentence_ids, backwards_sentence_ids], 1).reshape( - per_host_batch_size, -1) - else: - logging.info("Bi-directional data disabled.") - tokens, sentence_ids = _reshape_to_batch_dimensions( - tokens=tokens, - sentence_ids=sentence_ids, - per_host_batch_size=per_host_batch_size) - - logging.info("Tokens shape: %s", tokens.shape) - - data_length = tokens.shape[1] - sep = np.array([special_symbols["<sep>"]], dtype=np.int64) - cls = np.array([special_symbols["<cls>"]], dtype=np.int64) - # 2 sep, 1 cls - num_special_tokens = 3 - - data_index = 0 - batch_number = 0 - step_size = reuse_length if reuse_length else seq_length - num_batches = math.ceil(data_length / step_size) - - while data_index + seq_length <= data_length: - if batch_number % logging_frequency == 0: - logging.info("Processing batch %d of %d", batch_number, num_batches) - - for batch_index in range(per_host_batch_size): - previous_segment_tokens = tokens[ - batch_index, data_index: data_index + reuse_length] - - results = _create_a_and_b_segments( - tokens=tokens[batch_index], - sentence_ids=sentence_ids[batch_index], - begin_index=data_index + reuse_length, - total_length=seq_length - reuse_length - num_special_tokens) - - if results is None: - logging.info("Stopping at data index: %d", data_index) - break - a_data, b_data, label = results - - data = np.concatenate( - [previous_segment_tokens, a_data, sep, b_data, sep, cls]) - a_length = a_data.shape[0] - b_length = b_data.shape[0] - segment_ids = ([0] * (reuse_length + a_length) + [0] - + [1] * b_length + [1] + [2]) - boundary_indices = 
_get_boundary_indices(tokenizer=tokenizer, - data=data) - assert len(data) == seq_length - assert len(segment_ids) == seq_length - assert len(boundary_indices) > 0 # pylint: disable=g-explicit-length-test - - instances.append(TrainingInstance( - data=data, - segment_ids=segment_ids, - boundary_indices=boundary_indices, - label=label)) - batch_number += 1 - data_index += step_size - return instances - - -def write_instances_to_tfrecord( - instances: Iterable[TrainingInstance], - save_path: str): - """Writes instances to TFRecord.""" - record_writer = tf.io.TFRecordWriter(save_path) - logging.info("Start writing to %s.", save_path) - - for i, instance in enumerate(instances): - if i < 5: - logging.info("Instance %d: %s", i, str(instance)) - record_writer.write(instance.to_example().SerializeToString()) - - record_writer.close() - logging.info("Done writing %s.", save_path) - - -def shuffle_and_combine_preprocessed_data( - all_data: List[Tuple[np.array, np.array]]) -> Tuple[np.array, np.array]: - """Shuffles and combines preprocessed token/sentence IDs from documents.""" - document_permutation = np.random.permutation(len(all_data)) - - previous_sentence_id = None - - all_tokens, all_sentence_ids = [], [] - for document_index in document_permutation: - tokens, sentence_ids = all_data[document_index] - # pylint: disable=g-explicit-length-test - if len(tokens) == 0: - continue - if (previous_sentence_id is not None and - sentence_ids[0] == previous_sentence_id): - sentence_ids = np.logical_not(sentence_ids) - - all_tokens.append(tokens) - all_sentence_ids.append(sentence_ids) - - previous_sentence_id = sentence_ids[-1] - - return np.concatenate(all_tokens), np.concatenate(all_sentence_ids) - - -def get_tfrecord_name( - per_host_batch_size: int, - num_cores_per_host: int, - seq_length: int, - bi_data: bool, - reuse_length: int, - do_lower_case: bool, - use_eod_token: bool, - prefix: str = "", - suffix: str = "", - pass_id: int = 0, - num_passes: int = 1, - task_id: int = None, - num_tasks: int = None) -> str: - """Formats the resulting TFRecord name based on provided inputs.""" - components = [] - if prefix: - components.append(prefix) - components.append("seqlen-{}".format(seq_length)) - if reuse_length == 0: - components.append("memless") - else: - components.append("reuse-{}".format(reuse_length)) - components.append("bs-{}".format(per_host_batch_size)) - components.append("cores-{}".format(num_cores_per_host)) - - if do_lower_case: - components.append("uncased") - else: - components.append("cased") - if use_eod_token: - components.append("eod") - if bi_data: - components.append("bi") - else: - components.append("uni") - - if suffix: - components.append(suffix) - - s = "_".join(components) + ".tfrecord" - if num_passes == 1 and task_id is None: - return s - - if task_id is None: - num_tasks = 1 - task_id = 0 - - current_shard = task_id * num_passes + pass_id - total_shards = num_tasks * num_passes - return s + "-{}-of-{}".format(current_shard, total_shards) - - -def create_tfrecords( - tokenizer: tokenization.FullSentencePieceTokenizer, - input_file_or_files: str, - use_eod_token: bool, - do_lower_case: bool, - per_host_batch_size: int, - seq_length: int, - reuse_length: int, - bi_data: bool, - num_cores_per_host: int, - save_dir: str, - prefix: str = "", - suffix: str = "", - num_tasks: Optional[int] = None, - task_id: Optional[int] = None, - num_passes: int = 1): - """Runs the end-to-end preprocessing pipeline.""" - - logging.info("Input configuration:") - logging.info("input file(s): %s", 
input_file_or_files) - logging.info("use_eod_token: %s", use_eod_token) - logging.info("do_lower_case: %s", do_lower_case) - logging.info("per_host_batch_size: %d", per_host_batch_size) - logging.info("seq_length: %d", seq_length) - logging.info("reuse_length: %d", reuse_length) - logging.info("bi_data: %s", bi_data) - logging.info("num_cores_per_host: %d", num_cores_per_host) - logging.info("save_dir: %s", save_dir) - if task_id is not None and num_tasks is not None: - logging.info("task_id: %d", task_id) - logging.info("num_tasks: %d", num_tasks) - - input_files = [] - for input_pattern in input_file_or_files.split(","): - input_files.extend(tf.io.gfile.glob(input_pattern)) - - logging.info("*** Reading from input files ***") - for input_file in input_files: - logging.info(" %s", input_file) - - logging.info("Shuffling the files with a fixed random seed.") - np.random.shuffle(input_files) - if num_tasks is not None: - assert task_id is not None - logging.info("Total number of input files: %d", len(input_files)) - logging.info("Splitting into %d shards of %d files each.", - num_tasks, len(input_files) // num_tasks) - input_files = input_files[task_id::num_tasks] - - all_data = preprocess_and_tokenize_input_files( - input_files=input_files, - tokenizer=tokenizer, - use_eod=use_eod_token, - do_lower_case=do_lower_case) - for pass_id in range(num_passes): - logging.info("Beginning pass %d of %d", pass_id, num_passes) - tokens, sentence_ids = shuffle_and_combine_preprocessed_data(all_data) - - assert len(tokens) == len(sentence_ids) - - filename = get_tfrecord_name( - per_host_batch_size=per_host_batch_size, - num_cores_per_host=num_cores_per_host, - seq_length=seq_length, - bi_data=bi_data, - use_eod_token=use_eod_token, - reuse_length=reuse_length, - do_lower_case=do_lower_case, - prefix=prefix, - suffix=suffix, - pass_id=pass_id, - num_passes=num_passes, - num_tasks=num_tasks, - task_id=task_id) - save_path = os.path.join(save_dir, filename) - if os.path.exists(save_path): - # If the path already exists, then we were probably preempted but - # previously wrote this file. 
- logging.info("%s already exists, skipping this batch.", save_path) - else: - instances = _convert_tokens_to_instances( - tokenizer=tokenizer, - tokens=tokens, - sentence_ids=sentence_ids, - per_host_batch_size=per_host_batch_size, - seq_length=seq_length, - reuse_length=reuse_length, - bi_data=bi_data, - num_cores_per_host=num_cores_per_host) - write_instances_to_tfrecord(instances=instances, save_path=save_path) - - if task_id is None or task_id == 0: - corpus_info = { - "vocab_size": 32000, - "per_host_batch_size": per_host_batch_size, - "num_cores_per_host": num_cores_per_host, - "seq_length": seq_length, - "reuse_length": reuse_length, - "do_lower_case": do_lower_case, - "bi_data": bi_data, - "use_eod_token": use_eod_token, - } - corpus_fname = os.path.basename(filename) + ".json" - corpus_destination = os.path.join(save_dir, corpus_fname) - logging.info("Saving corpus info to %s", corpus_destination) - - with tf.io.gfile.GFile(corpus_destination, "w") as fp: - json.dump(corpus_info, fp) - - -def main(_): - tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file) - create_tfrecords( - tokenizer=tokenizer, - input_file_or_files=FLAGS.input_file, - use_eod_token=FLAGS.use_eod_token, - do_lower_case=FLAGS.do_lower_case, - per_host_batch_size=FLAGS.per_host_batch_size, - seq_length=FLAGS.seq_length, - reuse_length=FLAGS.reuse_length, - bi_data=FLAGS.bi_data, - num_cores_per_host=FLAGS.num_cores_per_host, - save_dir=FLAGS.save_dir, - prefix=FLAGS.prefix, - suffix=FLAGS.suffix, - num_tasks=FLAGS.num_tasks, - task_id=FLAGS.task_id, - num_passes=FLAGS.num_passes) - - -if __name__ == "__main__": - np.random.seed(0) - logging.set_verbosity(logging.INFO) - app.run(main) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/create_xlnet_pretraining_data_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/create_xlnet_pretraining_data_test.py deleted file mode 100644 index 94cf00843489eb6edac8f5133d296ff3cd27a913..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/create_xlnet_pretraining_data_test.py +++ /dev/null @@ -1,371 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""Tests for official.nlp.data.create_xlnet_pretraining_data.""" -import os -import tempfile -from typing import List - -from absl import logging -from absl.testing import parameterized - -import numpy as np -import tensorflow as tf - -from official.nlp.data import create_xlnet_pretraining_data as cpd - -_VOCAB_WORDS = ["vocab_1", "vocab_2"] - - -# pylint: disable=invalid-name -def _create_files( - temp_dir: str, file_contents: List[List[str]]) -> List[str]: - """Writes arbitrary documents into files.""" - root_dir = tempfile.mkdtemp(dir=temp_dir) - files = [] - - for i, file_content in enumerate(file_contents): - destination = os.path.join(root_dir, "%d.txt" % i) - with open(destination, "wb") as f: - for line in file_content: - f.write(line.encode("utf-8")) - files.append(destination) - return files - - -def _get_mock_tokenizer(): - """Creates a mock tokenizer.""" - - class MockSpieceModel: - """Mock Spiece model for testing.""" - - def __init__(self): - self._special_piece_to_id = { - "": 0, - } - for piece in set(list('!"#$%&\"()*+,-./:;?@[\\]^_`{|}~')): - self._special_piece_to_id[piece] = 1 - - def EncodeAsPieces(self, inputs: str) -> List[str]: - return inputs - - def SampleEncodeAsPieces(self, - inputs: str, - nbest_size: int, - theta: float) -> List[str]: - del nbest_size, theta - return inputs - - def PieceToId(self, piece: str) -> int: - return ord(piece[0]) - - def IdToPiece(self, id_: int) -> str: - return chr(id_) * 3 - - class Tokenizer: - """Mock Tokenizer for testing.""" - - def __init__(self): - self.sp_model = MockSpieceModel() - - def convert_ids_to_tokens(self, ids: List[int]) -> List[str]: - return [self.sp_model.IdToPiece(id_) for id_ in ids] - - return Tokenizer() - - -class PreprocessDataTest(tf.test.TestCase): - - def test_remove_extraneous_space(self): - line = " abc " - output = cpd._preprocess_line(line) - self.assertEqual(output, "abc") - - def test_symbol_replacements(self): - self.assertEqual(cpd._preprocess_line("``abc``"), "\"abc\"") - self.assertEqual(cpd._preprocess_line("''abc''"), "\"abc\"") - - def test_accent_replacements(self): - self.assertEqual(cpd._preprocess_line("氓bc"), "abc") - - def test_lower_case(self): - self.assertEqual(cpd._preprocess_line("ABC", do_lower_case=True), "abc") - - def test_end_to_end(self): - self.assertEqual( - cpd._preprocess_line("HelLo ``w贸rLd``", do_lower_case=True), - "hello \"world\"") - - -class PreprocessAndTokenizeFilesTest(tf.test.TestCase): - - def test_basic_end_to_end(self): - documents = [ - [ - "This is sentence 1.\n", - "This is sentence 2.\n", - "Sentence 3 is what this is.\n", - ], - [ - "This is the second document.\n", - "This is the second line of the second document.\n" - ], - ] - input_files = _create_files(temp_dir=self.get_temp_dir(), - file_contents=documents) - all_data = cpd.preprocess_and_tokenize_input_files( - input_files=input_files, - tokenizer=_get_mock_tokenizer(), - log_example_freq=1) - - self.assertEqual(len(all_data), len(documents)) - for token_ids, sentence_ids in all_data: - self.assertEqual(len(token_ids), len(sentence_ids)) - - def test_basic_correctness(self): - documents = [["a\n", "b\n", "c\n"]] - input_files = _create_files(temp_dir=self.get_temp_dir(), - file_contents=documents) - all_data = cpd.preprocess_and_tokenize_input_files( - input_files=input_files, - tokenizer=_get_mock_tokenizer(), - log_example_freq=1) - - token_ids, sentence_ids = all_data[0] - - self.assertAllClose(token_ids, [97, 98, 99]) - self.assertAllClose(sentence_ids, [True, False, True]) - - def 
test_correctness_with_spaces_and_accents(self):
-    documents = [[
-        " å \n",
-        "b \n",
-        " c \n",
-    ]]
-    input_files = _create_files(temp_dir=self.get_temp_dir(),
-                                file_contents=documents)
-    all_data = cpd.preprocess_and_tokenize_input_files(
-        input_files=input_files,
-        tokenizer=_get_mock_tokenizer(),
-        log_example_freq=1)
-
-    token_ids, sentence_ids = all_data[0]
-
-    self.assertAllClose(token_ids, [97, 98, 99])
-    self.assertAllClose(sentence_ids, [True, False, True])
-
-
-class BatchReshapeTests(tf.test.TestCase):
-
-  def test_basic_functionality(self):
-    per_host_batch_size = 3
-    mock_shape = (20,)
-
-    # Should truncate and reshape.
-    expected_result_shape = (3, 6)
-
-    tokens = np.zeros(mock_shape)
-    sentence_ids = np.zeros(mock_shape)
-
-    reshaped_data = cpd._reshape_to_batch_dimensions(
-        tokens=tokens,
-        sentence_ids=sentence_ids,
-        per_host_batch_size=per_host_batch_size)
-    for values in reshaped_data:
-      self.assertEqual(len(values.flatten()) % per_host_batch_size, 0)
-      self.assertAllClose(values.shape, expected_result_shape)
-
-
-class CreateSegmentsTest(tf.test.TestCase):
-
-  def test_basic_functionality(self):
-    data_length = 10
-    tokens = np.arange(data_length)
-    sentence_ids = np.concatenate([np.zeros(data_length // 2),
-                                   np.ones(data_length // 2)])
-    begin_index = 0
-    total_length = 8
-    a_data, b_data, label = cpd._create_a_and_b_segments(
-        tokens=tokens,
-        sentence_ids=sentence_ids,
-        begin_index=begin_index,
-        total_length=total_length,
-        no_cut_probability=0.)
-    self.assertAllClose(a_data, [0, 1, 2, 3])
-    self.assertAllClose(b_data, [5, 6, 7, 8])
-    self.assertEqual(label, 1)
-
-  def test_no_cut(self):
-    data_length = 10
-    tokens = np.arange(data_length)
-    sentence_ids = np.zeros(data_length)
-
-    begin_index = 0
-    total_length = 8
-    a_data, b_data, label = cpd._create_a_and_b_segments(
-        tokens=tokens,
-        sentence_ids=sentence_ids,
-        begin_index=begin_index,
-        total_length=total_length,
-        no_cut_probability=0.)
-    self.assertGreater(len(a_data), 0)
-    self.assertGreater(len(b_data), 0)
-    self.assertEqual(label, 0)
-
-  def test_no_cut_with_probability(self):
-    data_length = 10
-    tokens = np.arange(data_length)
-    sentence_ids = np.concatenate([np.zeros(data_length // 2),
-                                   np.ones(data_length // 2)])
-    begin_index = 0
-    total_length = 8
-    a_data, b_data, label = cpd._create_a_and_b_segments(
-        tokens=tokens,
-        sentence_ids=sentence_ids,
-        begin_index=begin_index,
-        total_length=total_length,
-        no_cut_probability=1.)
- self.assertGreater(len(a_data), 0) - self.assertGreater(len(b_data), 0) - self.assertEqual(label, 0) - - -class CreateInstancesTest(tf.test.TestCase): - """Tests conversions of Token/Sentence IDs to training instances.""" - - def test_basic(self): - data_length = 12 - tokens = np.arange(data_length) - sentence_ids = np.zeros(data_length) - seq_length = 8 - instances = cpd._convert_tokens_to_instances( - tokens=tokens, - sentence_ids=sentence_ids, - per_host_batch_size=2, - seq_length=seq_length, - reuse_length=4, - tokenizer=_get_mock_tokenizer(), - bi_data=False, - num_cores_per_host=1, - logging_frequency=1) - for instance in instances: - self.assertEqual(len(instance.data), seq_length) - self.assertEqual(len(instance.segment_ids), seq_length) - self.assertIsInstance(instance.label, int) - self.assertIsInstance(instance.boundary_indices, list) - - -class TFRecordPathTests(tf.test.TestCase): - - def test_basic(self): - base_kwargs = dict( - per_host_batch_size=1, - num_cores_per_host=1, - seq_length=2, - reuse_length=1) - - config1 = dict( - prefix="test", - suffix="", - bi_data=True, - use_eod_token=False, - do_lower_case=True) - config1.update(base_kwargs) - expectation1 = "test_seqlen-2_reuse-1_bs-1_cores-1_uncased_bi.tfrecord" - self.assertEqual(cpd.get_tfrecord_name(**config1), expectation1) - - config2 = dict( - prefix="", - suffix="test", - bi_data=False, - use_eod_token=False, - do_lower_case=False) - config2.update(base_kwargs) - expectation2 = "seqlen-2_reuse-1_bs-1_cores-1_cased_uni_test.tfrecord" - self.assertEqual(cpd.get_tfrecord_name(**config2), expectation2) - - config3 = dict( - prefix="", - suffix="", - use_eod_token=True, - bi_data=False, - do_lower_case=True) - config3.update(base_kwargs) - expectation3 = "seqlen-2_reuse-1_bs-1_cores-1_uncased_eod_uni.tfrecord" - self.assertEqual(cpd.get_tfrecord_name(**config3), expectation3) - - -class TestCreateTFRecords(parameterized.TestCase, tf.test.TestCase): - - @parameterized.named_parameters( - ("bi_data_only", True, False, False), - ("eod_token_only", False, True, True), - ("lower_case_only", False, False, True), - ("all_enabled", True, True, True), - ) - def test_end_to_end(self, - bi_data: bool, - use_eod_token: bool, - do_lower_case: bool): - tokenizer = _get_mock_tokenizer() - - num_documents = 5 - sentences_per_document = 10 - document_length = 50 - - documents = [ - ["a " * document_length for _ in range(sentences_per_document)] - for _ in range(num_documents)] - - save_dir = tempfile.mkdtemp(dir=self.get_temp_dir()) - files = _create_files(temp_dir=self.get_temp_dir(), file_contents=documents) - - cpd.create_tfrecords( - tokenizer=tokenizer, - input_file_or_files=",".join(files), - use_eod_token=use_eod_token, - do_lower_case=do_lower_case, - per_host_batch_size=8, - seq_length=8, - reuse_length=4, - bi_data=bi_data, - num_cores_per_host=2, - save_dir=save_dir) - - self.assertTrue(any(filter(lambda x: x.endswith(".json"), - os.listdir(save_dir)))) - self.assertTrue(any(filter(lambda x: x.endswith(".tfrecord"), - os.listdir(save_dir)))) - - -if __name__ == "__main__": - np.random.seed(0) - logging.set_verbosity(logging.INFO) - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/data_loader.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/data_loader.py deleted file mode 100644 index 03657e83acff22f75c110832ff1d9da1cb344ed7..0000000000000000000000000000000000000000 --- 
a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/data_loader.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""An abstraction through which NLP models define input pipelines."""
-
-import abc
-from typing import Optional
-
-import tensorflow as tf
-
-
-class DataLoader(metaclass=abc.ABCMeta):
-  """An abstract class defining the APIs for the tf.data input pipeline."""
-
-  @abc.abstractmethod
-  def load(
-      self,
-      input_context: Optional[tf.distribute.InputContext] = None
-  ) -> tf.data.Dataset:
-    """Implements the DataLoader load method.
-
-    Builds the entire input pipeline inside the load method. Users can define
-    state inside the DataLoader class and return a tf.data.Dataset
-    object.
-
-    Args:
-      input_context: This is a context class that is passed to the user's input
-        function and contains information about the compute replicas and input
-        pipelines. This object is used for multi-host inputs and passed by the
-        distribution strategy.
-
-    Returns:
-      A per-host tf.data.Dataset. Note that we usually create the distributed
-      dataset through the load method, so we should not directly return a
-      distributed dataset here.
-    """
-    pass
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/data_loader_factory.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/data_loader_factory.py
deleted file mode 100644
index d9506145efb3c73f979d179f8d3c311b92790434..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/data_loader_factory.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
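To make the contract concrete, here is a minimal sketch of a loader satisfying the `DataLoader` interface above. The class name, file pattern, and feature shape are illustrative, not part of the original sources:

from typing import Optional

import tensorflow as tf

from official.nlp.data import data_loader


class SentenceTFRecordLoader(data_loader.DataLoader):
  """Hypothetical loader reading fixed-length examples from TFRecords."""

  def __init__(self, file_pattern: str, seq_length: int, batch_size: int):
    self._file_pattern = file_pattern  # illustrative constructor state
    self._seq_length = seq_length
    self._per_host_batch_size = batch_size

  def _parse(self, record: tf.Tensor):
    features = {
        'input_word_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
    }
    return tf.io.parse_single_example(record, features)

  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
    files = tf.data.Dataset.list_files(self._file_pattern)
    if input_context:
      # Shard by input pipeline so each host reads a disjoint set of files.
      files = files.shard(input_context.num_input_pipelines,
                          input_context.input_pipeline_id)
    dataset = files.interleave(tf.data.TFRecordDataset)
    dataset = dataset.map(self._parse)
    # Return the per-host dataset; distribution wrapping happens outside load().
    return dataset.batch(self._per_host_batch_size)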
-# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""A global factory to access NLP registered data loaders.""" - -from official.core import registry - -_REGISTERED_DATA_LOADER_CLS = {} - - -def register_data_loader_cls(data_config_cls): - """Decorates a factory of DataLoader for lookup by a subclass of DataConfig. - - This decorator supports registration of data loaders as follows: - - ``` - @dataclasses.dataclass - class MyDataConfig(DataConfig): - # Add fields here. - pass - - @register_data_loader_cls(MyDataConfig) - class MyDataLoader: - # Inherits def __init__(self, data_config). - pass - - my_data_config = MyDataConfig() - - # Returns MyDataLoader(my_data_config). - my_loader = get_data_loader(my_data_config) - ``` - - Args: - data_config_cls: a subclass of DataConfig (*not* an instance - of DataConfig). - - Returns: - A callable for use as class decorator that registers the decorated class - for creation from an instance of data_config_cls. - """ - return registry.register(_REGISTERED_DATA_LOADER_CLS, data_config_cls) - - -def get_data_loader(data_config): - """Creates a data_loader from data_config.""" - return registry.lookup(_REGISTERED_DATA_LOADER_CLS, data_config.__class__)( - data_config) diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/data_loader_factory_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/data_loader_factory_test.py deleted file mode 100644 index 707f6107e5f9f193685f110d4450088f1bc6194d..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/data_loader_factory_test.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Tests for official.nlp.data.data_loader_factory.""" - -import dataclasses -import tensorflow as tf - -from official.core import config_definitions as cfg -from official.nlp.data import data_loader_factory - - -@dataclasses.dataclass -class MyDataConfig(cfg.DataConfig): - is_training: bool = True - - -@data_loader_factory.register_data_loader_cls(MyDataConfig) -class MyDataLoader: - - def __init__(self, params): - self.params = params - - -class DataLoaderFactoryTest(tf.test.TestCase): - - def test_register_and_load(self): - train_config = MyDataConfig() - train_loader = data_loader_factory.get_data_loader(train_config) - self.assertTrue(train_loader.params.is_training) - - -if __name__ == "__main__": - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/pretrain_dataloader.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/pretrain_dataloader.py deleted file mode 100644 index efe864a083189ea75221bbb0073cef52f2c96c18..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/pretrain_dataloader.py +++ /dev/null @@ -1,620 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Loads dataset for the BERT pretraining task.""" -from typing import Mapping, Optional - -from absl import logging - -import dataclasses -import numpy as np -import tensorflow as tf -from official.core import config_definitions as cfg -from official.core import input_reader -from official.nlp.data import data_loader -from official.nlp.data import data_loader_factory - - -@dataclasses.dataclass -class BertPretrainDataConfig(cfg.DataConfig): - """Data config for BERT pretraining task (tasks/masked_lm).""" - input_path: str = '' - global_batch_size: int = 512 - is_training: bool = True - seq_length: int = 512 - max_predictions_per_seq: int = 76 - use_next_sentence_label: bool = True - use_position_id: bool = False - # Historically, BERT implementations take `input_ids` and `segment_ids` as - # feature names. 
Inside the TF Model Garden implementation, the Keras model
-  # inputs are set as `input_word_ids` and `input_type_ids`. When
-  # `use_v2_feature_names` is True, the data loader assumes the tf.Examples
-  # use `input_word_ids` and `input_type_ids` as keys.
-  use_v2_feature_names: bool = False
-
-
-@data_loader_factory.register_data_loader_cls(BertPretrainDataConfig)
-class BertPretrainDataLoader(data_loader.DataLoader):
-  """A class to load the dataset for the BERT pretraining task."""
-
-  def __init__(self, params):
-    """Inits `BertPretrainDataLoader` class.
-
-    Args:
-      params: A `BertPretrainDataConfig` object.
-    """
-    self._params = params
-    self._seq_length = params.seq_length
-    self._max_predictions_per_seq = params.max_predictions_per_seq
-    self._use_next_sentence_label = params.use_next_sentence_label
-    self._use_position_id = params.use_position_id
-
-  def _name_to_features(self):
-    name_to_features = {
-        'input_mask':
-            tf.io.FixedLenFeature([self._seq_length], tf.int64),
-        'masked_lm_positions':
-            tf.io.FixedLenFeature([self._max_predictions_per_seq], tf.int64),
-        'masked_lm_ids':
-            tf.io.FixedLenFeature([self._max_predictions_per_seq], tf.int64),
-        'masked_lm_weights':
-            tf.io.FixedLenFeature([self._max_predictions_per_seq], tf.float32),
-    }
-    if self._params.use_v2_feature_names:
-      name_to_features.update({
-          'input_word_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
-          'input_type_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
-      })
-    else:
-      name_to_features.update({
-          'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
-          'segment_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
-      })
-    if self._use_next_sentence_label:
-      name_to_features['next_sentence_labels'] = tf.io.FixedLenFeature([1],
-                                                                       tf.int64)
-    if self._use_position_id:
-      name_to_features['position_ids'] = tf.io.FixedLenFeature(
-          [self._seq_length], tf.int64)
-    return name_to_features
-
-  def _decode(self, record: tf.Tensor):
-    """Decodes a serialized tf.Example."""
-    name_to_features = self._name_to_features()
-    example = tf.io.parse_single_example(record, name_to_features)
-
-    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
-    # So cast all int64 to int32.
-    for name in list(example.keys()):
-      t = example[name]
-      if t.dtype == tf.int64:
-        t = tf.cast(t, tf.int32)
-      example[name] = t
-
-    return example
-
-  def _parse(self, record: Mapping[str, tf.Tensor]):
-    """Parses raw tensors into a dict of tensors to be consumed by the model."""
-    x = {
-        'input_mask': record['input_mask'],
-        'masked_lm_positions': record['masked_lm_positions'],
-        'masked_lm_ids': record['masked_lm_ids'],
-        'masked_lm_weights': record['masked_lm_weights'],
-    }
-    if self._params.use_v2_feature_names:
-      x['input_word_ids'] = record['input_word_ids']
-      x['input_type_ids'] = record['input_type_ids']
-    else:
-      x['input_word_ids'] = record['input_ids']
-      x['input_type_ids'] = record['segment_ids']
-    if self._use_next_sentence_label:
-      x['next_sentence_labels'] = record['next_sentence_labels']
-    if self._use_position_id:
-      x['position_ids'] = record['position_ids']
-
-    return x
-
-  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
-    """Returns a tf.data.Dataset."""
-    reader = input_reader.InputReader(
-        params=self._params, decoder_fn=self._decode, parser_fn=self._parse)
-    return reader.read(input_context)
-
-
-@dataclasses.dataclass
-class XLNetPretrainDataConfig(cfg.DataConfig):
-  """Data config for XLNet pretraining task.
-
-  Attributes:
-    input_path: See base class.
-    global_batch_size: See base class.
-    is_training: See base class.
-    seq_length: The length of each sequence.
-    max_predictions_per_seq: The number of predictions per sequence.
-    reuse_length: The number of tokens in a previous segment to reuse. This
-      should be the same value used during pretrain data creation.
-    sample_strategy: The strategy used to sample factorization permutations.
-      Possible values: 'single_token', 'whole_word', 'token_span', 'word_span'.
-    min_num_tokens: The minimum number of tokens to sample in a span.
-      This is used when `sample_strategy` is 'token_span'.
-    max_num_tokens: The maximum number of tokens to sample in a span.
-      This is used when `sample_strategy` is 'token_span'.
-    min_num_words: The minimum number of words to sample in a span.
-      This is used when `sample_strategy` is 'word_span'.
-    max_num_words: The maximum number of words to sample in a span.
-      This is used when `sample_strategy` is 'word_span'.
-    permutation_size: The length of the longest permutation. This can be set
-      to `reuse_length`. This should NOT be greater than `reuse_length`,
-      otherwise this may introduce data leaks.
-    leak_ratio: The percentage of masked tokens that are leaked.
-    segment_sep_id: The ID of the SEP token used when preprocessing
-      the dataset.
-    segment_cls_id: The ID of the CLS token used when preprocessing
-      the dataset.
-
-  """
-  input_path: str = ''
-  global_batch_size: int = 512
-  is_training: bool = True
-  seq_length: int = 512
-  max_predictions_per_seq: int = 76
-  reuse_length: int = 256
-  sample_strategy: str = 'word_span'
-  min_num_tokens: int = 1
-  max_num_tokens: int = 5
-  min_num_words: int = 1
-  max_num_words: int = 5
-  permutation_size: int = 256
-  leak_ratio: float = 0.1
-  segment_sep_id: int = 4
-  segment_cls_id: int = 3
-
-
-@data_loader_factory.register_data_loader_cls(XLNetPretrainDataConfig)
-class XLNetPretrainDataLoader(data_loader.DataLoader):
-  """A class to load the dataset for the XLNet pretraining task."""
-
-  def __init__(self, params: XLNetPretrainDataConfig):
-    """Inits `XLNetPretrainDataLoader` class.
-
-    Args:
-      params: An `XLNetPretrainDataConfig` object.
-    """
-    self._params = params
-    self._seq_length = params.seq_length
-    self._max_predictions_per_seq = params.max_predictions_per_seq
-    self._reuse_length = params.reuse_length
-    self._num_replicas_in_sync = None
-    self._permutation_size = params.permutation_size
-    self._sep_id = params.segment_sep_id
-    self._cls_id = params.segment_cls_id
-    self._sample_strategy = params.sample_strategy
-    self._leak_ratio = params.leak_ratio
-
-  def _decode(self, record: tf.Tensor):
-    """Decodes a serialized tf.Example."""
-    name_to_features = {
-        'input_word_ids':
-            tf.io.FixedLenFeature([self._seq_length], tf.int64),
-        'input_type_ids':
-            tf.io.FixedLenFeature([self._seq_length], tf.int64),
-        'boundary_indices':
-            tf.io.VarLenFeature(tf.int64),
-    }
-    example = tf.io.parse_single_example(record, name_to_features)
-
-    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
-    # So cast all int64 to int32.
-    for name in list(example.keys()):
-      t = example[name]
-      if t.dtype == tf.int64:
-        t = tf.cast(t, tf.int32)
-      example[name] = t
-
-    return example
-
-  def _parse(self, record: Mapping[str, tf.Tensor]):
-    """Parses raw tensors into a dict of tensors to be consumed by the model."""
-    x = {}
-
-    inputs = record['input_word_ids']
-    x['input_type_ids'] = record['input_type_ids']
-
-    if self._sample_strategy in ['whole_word', 'word_span']:
-      boundary = tf.sparse.to_dense(record['boundary_indices'])
-    else:
-      boundary = None
-
-    input_mask = self._online_sample_mask(inputs=inputs, boundary=boundary)
-
-    if self._reuse_length > 0:
-      if self._permutation_size > self._reuse_length:
-        logging.warning(
-            '`permutation_size` is greater than `reuse_length` (%d > %d). '
-            'This may introduce data leakage.',
-            self._permutation_size, self._reuse_length)
-
-      # Enable the memory mechanism.
-      # Permute the reuse and non-reuse segments separately.
-      non_reuse_len = self._seq_length - self._reuse_length
-      if not (self._reuse_length % self._permutation_size == 0
-              and non_reuse_len % self._permutation_size == 0):
-        raise ValueError('`reuse_length` and `seq_length` should both be '
-                         'a multiple of `permutation_size`.')
-
-      # Creates permutation mask and target mask for the first reuse_len tokens.
-      # The tokens in this part are reused from the last sequence.
-      perm_mask_0, target_mask_0, tokens_0, masked_0 = self._get_factorization(
-          inputs=inputs[:self._reuse_length],
-          input_mask=input_mask[:self._reuse_length])
-
-      # Creates permutation mask and target mask for the rest of the tokens in
-      # the current example, which are the concatenation of two new segments.
-      perm_mask_1, target_mask_1, tokens_1, masked_1 = self._get_factorization(
-          inputs[self._reuse_length:], input_mask[self._reuse_length:])
-
-      perm_mask_0 = tf.concat(
-          [perm_mask_0,
-           tf.zeros([self._reuse_length, non_reuse_len], dtype=tf.int32)],
-          axis=1)
-      perm_mask_1 = tf.concat(
-          [tf.ones([non_reuse_len, self._reuse_length], dtype=tf.int32),
-           perm_mask_1], axis=1)
-      perm_mask = tf.concat([perm_mask_0, perm_mask_1], axis=0)
-      target_mask = tf.concat([target_mask_0, target_mask_1], axis=0)
-      tokens = tf.concat([tokens_0, tokens_1], axis=0)
-      masked_tokens = tf.concat([masked_0, masked_1], axis=0)
-    else:
-      # Disable the memory mechanism.
-      if self._seq_length % self._permutation_size != 0:
-        raise ValueError('`seq_length` should be a multiple of '
-                         '`permutation_size`.')
-      # Permute the entire sequence together.
-      perm_mask, target_mask, tokens, masked_tokens = self._get_factorization(
-          inputs=inputs, input_mask=input_mask)
-    x['permutation_mask'] = tf.reshape(
-        perm_mask, [self._seq_length, self._seq_length])
-    x['input_word_ids'] = tokens
-    x['masked_tokens'] = masked_tokens
-
-    target = tokens
-    if self._max_predictions_per_seq is not None:
-      indices = tf.range(self._seq_length, dtype=tf.int32)
-      bool_target_mask = tf.cast(target_mask, tf.bool)
-      indices = tf.boolean_mask(indices, bool_target_mask)
-
-      # Account for extra padding due to CLS/SEP.
- actual_num_predict = tf.shape(indices)[0] - pad_len = self._max_predictions_per_seq - actual_num_predict - - target_mapping = tf.one_hot(indices, self._seq_length, dtype=tf.int32) - paddings = tf.zeros([pad_len, self._seq_length], - dtype=target_mapping.dtype) - target_mapping = tf.concat([target_mapping, paddings], axis=0) - x['target_mapping'] = tf.reshape( - target_mapping, [self._max_predictions_per_seq, self._seq_length]) - - target = tf.boolean_mask(target, bool_target_mask) - paddings = tf.zeros([pad_len], dtype=target.dtype) - target = tf.concat([target, paddings], axis=0) - x['target'] = tf.reshape(target, [self._max_predictions_per_seq]) - - target_mask = tf.concat([ - tf.ones([actual_num_predict], dtype=tf.int32), - tf.zeros([pad_len], dtype=tf.int32) - ], axis=0) - x['target_mask'] = tf.reshape(target_mask, - [self._max_predictions_per_seq]) - else: - x['target'] = tf.reshape(target, [self._seq_length]) - x['target_mask'] = tf.reshape(target_mask, [self._seq_length]) - return x - - def _index_pair_to_mask(self, - begin_indices: tf.Tensor, - end_indices: tf.Tensor, - inputs: tf.Tensor) -> tf.Tensor: - """Converts beginning and end indices into an actual mask.""" - non_func_mask = tf.logical_and( - tf.not_equal(inputs, self._sep_id), tf.not_equal(inputs, self._cls_id)) - all_indices = tf.where( - non_func_mask, - tf.range(self._seq_length, dtype=tf.int32), - tf.constant(-1, shape=[self._seq_length], dtype=tf.int32)) - candidate_matrix = tf.cast( - tf.logical_and(all_indices[None, :] >= begin_indices[:, None], - all_indices[None, :] < end_indices[:, None]), tf.float32) - cumsum_matrix = tf.reshape( - tf.cumsum(tf.reshape(candidate_matrix, [-1])), [-1, self._seq_length]) - masked_matrix = tf.cast(cumsum_matrix <= self._max_predictions_per_seq, - tf.float32) - target_mask = tf.reduce_sum(candidate_matrix * masked_matrix, axis=0) - return tf.cast(target_mask, tf.bool) - - def _single_token_mask(self, inputs: tf.Tensor) -> tf.Tensor: - """Samples individual tokens as prediction targets.""" - all_indices = tf.range(self._seq_length, dtype=tf.int32) - non_func_mask = tf.logical_and( - tf.not_equal(inputs, self._sep_id), tf.not_equal(inputs, self._cls_id)) - non_func_indices = tf.boolean_mask(all_indices, non_func_mask) - - masked_pos = tf.random.shuffle(non_func_indices) - masked_pos = tf.sort(masked_pos[:self._max_predictions_per_seq]) - - sparse_indices = tf.stack( - [tf.zeros_like(masked_pos), masked_pos], axis=-1) - sparse_indices = tf.cast(sparse_indices, tf.int64) - - sparse_indices = tf.sparse.SparseTensor( - sparse_indices, - values=tf.ones_like(masked_pos), - dense_shape=(1, self._seq_length)) - - target_mask = tf.sparse.to_dense( - sp_input=sparse_indices, - default_value=0) - - return tf.squeeze(tf.cast(target_mask, tf.bool)) - - def _whole_word_mask(self, - inputs: tf.Tensor, - boundary: tf.Tensor) -> tf.Tensor: - """Samples whole words as prediction targets.""" - pair_indices = tf.concat([boundary[:-1, None], boundary[1:, None]], axis=1) - cand_pair_indices = tf.random.shuffle( - pair_indices)[:self._max_predictions_per_seq] - begin_indices = cand_pair_indices[:, 0] - end_indices = cand_pair_indices[:, 1] - - return self._index_pair_to_mask( - begin_indices=begin_indices, - end_indices=end_indices, - inputs=inputs) - - def _token_span_mask(self, inputs: tf.Tensor) -> tf.Tensor: - """Samples token spans as prediction targets.""" - min_num_tokens = self._params.min_num_tokens - max_num_tokens = self._params.max_num_tokens - - mask_alpha = self._seq_length / 
self._max_predictions_per_seq - round_to_int = lambda x: tf.cast(tf.round(x), tf.int32) - - # Sample span lengths from a zipf distribution - span_len_seq = np.arange(min_num_tokens, max_num_tokens + 1) - probs = np.array([1.0 / (i + 1) for i in span_len_seq]) - - probs /= np.sum(probs) - logits = tf.constant(np.log(probs), dtype=tf.float32) - span_lens = tf.random.categorical( - logits=logits[None], - num_samples=self._max_predictions_per_seq, - dtype=tf.int32, - )[0] + min_num_tokens - - # Sample the ratio [0.0, 1.0) of left context lengths - span_lens_float = tf.cast(span_lens, tf.float32) - left_ratio = tf.random.uniform( - shape=[self._max_predictions_per_seq], minval=0.0, maxval=1.0) - left_ctx_len = left_ratio * span_lens_float * (mask_alpha - 1) - left_ctx_len = round_to_int(left_ctx_len) - - # Compute the offset from left start to the right end - right_offset = round_to_int(span_lens_float * mask_alpha) - left_ctx_len - - # Get the actual begin and end indices - begin_indices = ( - tf.cumsum(left_ctx_len) + tf.cumsum(right_offset, exclusive=True)) - end_indices = begin_indices + span_lens - - # Remove out of range indices - valid_idx_mask = end_indices < self._seq_length - begin_indices = tf.boolean_mask(begin_indices, valid_idx_mask) - end_indices = tf.boolean_mask(end_indices, valid_idx_mask) - - # Shuffle valid indices - num_valid = tf.cast(tf.shape(begin_indices)[0], tf.int32) - order = tf.random.shuffle(tf.range(num_valid, dtype=tf.int32)) - begin_indices = tf.gather(begin_indices, order) - end_indices = tf.gather(end_indices, order) - - return self._index_pair_to_mask( - begin_indices=begin_indices, - end_indices=end_indices, - inputs=inputs) - - def _word_span_mask(self, - inputs: tf.Tensor, - boundary: tf.Tensor): - """Sample whole word spans as prediction targets.""" - min_num_words = self._params.min_num_words - max_num_words = self._params.max_num_words - - # Note: 1.2 is the token-to-word ratio - mask_alpha = self._seq_length / self._max_predictions_per_seq / 1.2 - round_to_int = lambda x: tf.cast(tf.round(x), tf.int32) - - # Sample span lengths from a zipf distribution - span_len_seq = np.arange(min_num_words, max_num_words + 1) - probs = np.array([1.0 / (i + 1) for i in span_len_seq]) - probs /= np.sum(probs) - logits = tf.constant(np.log(probs), dtype=tf.float32) - - # Sample `num_predict` words here: note that this is over sampling - span_lens = tf.random.categorical( - logits=logits[None], - num_samples=self._max_predictions_per_seq, - dtype=tf.int32, - )[0] + min_num_words - - # Sample the ratio [0.0, 1.0) of left context lengths - span_lens_float = tf.cast(span_lens, tf.float32) - left_ratio = tf.random.uniform( - shape=[self._max_predictions_per_seq], minval=0.0, maxval=1.0) - left_ctx_len = left_ratio * span_lens_float * (mask_alpha - 1) - - left_ctx_len = round_to_int(left_ctx_len) - right_offset = round_to_int(span_lens_float * mask_alpha) - left_ctx_len - - begin_indices = ( - tf.cumsum(left_ctx_len) + tf.cumsum(right_offset, exclusive=True)) - end_indices = begin_indices + span_lens - - # Remove out of range indices - max_boundary_index = tf.cast(tf.shape(boundary)[0] - 1, tf.int32) - valid_idx_mask = end_indices < max_boundary_index - begin_indices = tf.boolean_mask(begin_indices, valid_idx_mask) - end_indices = tf.boolean_mask(end_indices, valid_idx_mask) - - begin_indices = tf.gather(boundary, begin_indices) - end_indices = tf.gather(boundary, end_indices) - - # Shuffle valid indices - num_valid = tf.cast(tf.shape(begin_indices)[0], tf.int32) - order = 
tf.random.shuffle(tf.range(num_valid, dtype=tf.int32)) - begin_indices = tf.gather(begin_indices, order) - end_indices = tf.gather(end_indices, order) - - return self._index_pair_to_mask( - begin_indices=begin_indices, - end_indices=end_indices, - inputs=inputs) - - def _online_sample_mask(self, - inputs: tf.Tensor, - boundary: tf.Tensor) -> tf.Tensor: - """Samples target positions for predictions. - - Descriptions of each strategy: - - 'single_token': Samples individual tokens as prediction targets. - - 'token_span': Samples spans of tokens as prediction targets. - - 'whole_word': Samples individual words as prediction targets. - - 'word_span': Samples spans of words as prediction targets. - - Args: - inputs: The input tokens. - boundary: The `int` Tensor of indices indicating whole word boundaries. - This is used in 'whole_word' and 'word_span' - - Returns: - The sampled `bool` input mask. - - Raises: - `ValueError`: if `max_predictions_per_seq` is not set or if boundary is - not provided for 'whole_word' and 'word_span' sample strategies. - """ - if self._max_predictions_per_seq is None: - raise ValueError('`max_predictions_per_seq` must be set.') - - if boundary is None and 'word' in self._sample_strategy: - raise ValueError('`boundary` must be provided for {} strategy'.format( - self._sample_strategy)) - - if self._sample_strategy == 'single_token': - return self._single_token_mask(inputs) - elif self._sample_strategy == 'token_span': - return self._token_span_mask(inputs) - elif self._sample_strategy == 'whole_word': - return self._whole_word_mask(inputs, boundary) - elif self._sample_strategy == 'word_span': - return self._word_span_mask(inputs, boundary) - else: - raise NotImplementedError('Invalid sample strategy.') - - def _get_factorization(self, - inputs: tf.Tensor, - input_mask: tf.Tensor): - """Samples a permutation of the factorization order. - - Args: - inputs: the input tokens. - input_mask: the `bool` Tensor of the same shape as `inputs`. - If `True`, then this means select for partial prediction. - - Returns: - perm_mask: An `int32` Tensor of shape [seq_length, seq_length] consisting - of 0s and 1s. If perm_mask[i][j] == 0, then this means that the i-th - token (in original order) cannot attend to the jth attention token. - target_mask: An `int32` Tensor of shape [seq_len] consisting of 0s and 1s. - If target_mask[i] == 1, then the i-th token needs to be predicted and - the mask will be used as input. This token will be included in the loss. - If target_mask[i] == 0, then the token (or [SEP], [CLS]) will be used as - input. This token will not be included in the loss. - tokens: int32 Tensor of shape [seq_length]. - masked_tokens: int32 Tensor of shape [seq_length]. 
-
-    """
-    factorization_length = tf.shape(inputs)[0]
-    # Generate permutation indices.
-    index = tf.range(factorization_length, dtype=tf.int32)
-    index = tf.transpose(tf.reshape(index, [-1, self._permutation_size]))
-    index = tf.random.shuffle(index)
-    index = tf.reshape(tf.transpose(index), [-1])
-
-    input_mask = tf.cast(input_mask, tf.bool)
-
-    # Non-functional tokens.
-    non_func_tokens = tf.logical_not(
-        tf.logical_or(
-            tf.equal(inputs, self._sep_id), tf.equal(inputs, self._cls_id)))
-    masked_tokens = tf.logical_and(input_mask, non_func_tokens)
-    non_masked_or_func_tokens = tf.logical_not(masked_tokens)
-
-    smallest_index = -2 * tf.ones([factorization_length], dtype=tf.int32)
-
-    # Similar to BERT, randomly leak some masked tokens.
-    if self._leak_ratio > 0:
-      leak_tokens = tf.logical_and(
-          masked_tokens,
-          tf.random.uniform([factorization_length],
-                            maxval=1.0) < self._leak_ratio)
-      can_attend_self = tf.logical_or(non_masked_or_func_tokens, leak_tokens)
-    else:
-      can_attend_self = non_masked_or_func_tokens
-    to_index = tf.where(can_attend_self, smallest_index, index)
-    from_index = tf.where(can_attend_self, to_index + 1, to_index)
-
-    # Masked tokens can attend only if i > j; context tokens can always
-    # attend to each other.
-    can_attend = from_index[:, None] > to_index[None, :]
-
-    perm_mask = tf.cast(can_attend, tf.int32)
-
-    # Only masked tokens are included in the loss.
-    target_mask = tf.cast(masked_tokens, tf.int32)
-
-    return perm_mask, target_mask, inputs, masked_tokens
-
-  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
-    """Returns a tf.data.Dataset."""
-    if input_context:
-      self._num_replicas_in_sync = input_context.num_replicas_in_sync
-    reader = input_reader.InputReader(
-        params=self._params, decoder_fn=self._decode, parser_fn=self._parse)
-    return reader.read(input_context)
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/pretrain_dataloader_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/pretrain_dataloader_test.py
deleted file mode 100644
index 826cce8845f2af4d8b48e19d384ae99f52a69d0a..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/pretrain_dataloader_test.py
+++ /dev/null
@@ -1,258 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Tests for official.nlp.data.pretrain_dataloader.""" -import itertools -import os - -from absl.testing import parameterized -import numpy as np -import tensorflow as tf - -from official.nlp.data import pretrain_dataloader - - -def create_int_feature(values): - f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) - return f - - -def _create_fake_bert_dataset( - output_path, - seq_length, - max_predictions_per_seq, - use_position_id, - use_next_sentence_label, - use_v2_feature_names=False): - """Creates a fake dataset.""" - writer = tf.io.TFRecordWriter(output_path) - - def create_float_feature(values): - f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) - return f - - for _ in range(100): - features = {} - input_ids = np.random.randint(100, size=(seq_length)) - features["input_mask"] = create_int_feature(np.ones_like(input_ids)) - if use_v2_feature_names: - features["input_word_ids"] = create_int_feature(input_ids) - features["input_type_ids"] = create_int_feature(np.ones_like(input_ids)) - else: - features["input_ids"] = create_int_feature(input_ids) - features["segment_ids"] = create_int_feature(np.ones_like(input_ids)) - - features["masked_lm_positions"] = create_int_feature( - np.random.randint(100, size=(max_predictions_per_seq))) - features["masked_lm_ids"] = create_int_feature( - np.random.randint(100, size=(max_predictions_per_seq))) - features["masked_lm_weights"] = create_float_feature( - [1.0] * max_predictions_per_seq) - - if use_next_sentence_label: - features["next_sentence_labels"] = create_int_feature([1]) - - if use_position_id: - features["position_ids"] = create_int_feature(range(0, seq_length)) - - tf_example = tf.train.Example(features=tf.train.Features(feature=features)) - writer.write(tf_example.SerializeToString()) - writer.close() - - -def _create_fake_xlnet_dataset( - output_path, seq_length, max_predictions_per_seq): - """Creates a fake dataset.""" - writer = tf.io.TFRecordWriter(output_path) - for _ in range(100): - features = {} - input_ids = np.random.randint(100, size=(seq_length)) - num_boundary_indices = np.random.randint(1, seq_length) - - if max_predictions_per_seq is not None: - input_mask = np.zeros_like(input_ids) - input_mask[:max_predictions_per_seq] = 1 - np.random.shuffle(input_mask) - else: - input_mask = np.ones_like(input_ids) - - features["input_mask"] = create_int_feature(input_mask) - features["input_word_ids"] = create_int_feature(input_ids) - features["input_type_ids"] = create_int_feature(np.ones_like(input_ids)) - features["boundary_indices"] = create_int_feature( - sorted(np.random.randint(seq_length, size=(num_boundary_indices)))) - features["target"] = create_int_feature(input_ids + 1) - features["label"] = create_int_feature([1]) - tf_example = tf.train.Example(features=tf.train.Features(feature=features)) - writer.write(tf_example.SerializeToString()) - writer.close() - - -class BertPretrainDataTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.parameters(itertools.product( - (False, True), - (False, True), - )) - def test_load_data(self, use_next_sentence_label, use_position_id): - train_data_path = os.path.join(self.get_temp_dir(), "train.tf_record") - seq_length = 128 - max_predictions_per_seq = 20 - _create_fake_bert_dataset( - train_data_path, - seq_length, - max_predictions_per_seq, - use_next_sentence_label=use_next_sentence_label, - 
use_position_id=use_position_id) - data_config = pretrain_dataloader.BertPretrainDataConfig( - input_path=train_data_path, - max_predictions_per_seq=max_predictions_per_seq, - seq_length=seq_length, - global_batch_size=10, - is_training=True, - use_next_sentence_label=use_next_sentence_label, - use_position_id=use_position_id) - - dataset = pretrain_dataloader.BertPretrainDataLoader(data_config).load() - features = next(iter(dataset)) - self.assertLen(features, - 6 + int(use_next_sentence_label) + int(use_position_id)) - self.assertIn("input_word_ids", features) - self.assertIn("input_mask", features) - self.assertIn("input_type_ids", features) - self.assertIn("masked_lm_positions", features) - self.assertIn("masked_lm_ids", features) - self.assertIn("masked_lm_weights", features) - - self.assertEqual("next_sentence_labels" in features, - use_next_sentence_label) - self.assertEqual("position_ids" in features, use_position_id) - - def test_v2_feature_names(self): - train_data_path = os.path.join(self.get_temp_dir(), "train.tf_record") - seq_length = 128 - max_predictions_per_seq = 20 - _create_fake_bert_dataset( - train_data_path, - seq_length, - max_predictions_per_seq, - use_next_sentence_label=True, - use_position_id=False, - use_v2_feature_names=True) - data_config = pretrain_dataloader.BertPretrainDataConfig( - input_path=train_data_path, - max_predictions_per_seq=max_predictions_per_seq, - seq_length=seq_length, - global_batch_size=10, - is_training=True, - use_next_sentence_label=True, - use_position_id=False, - use_v2_feature_names=True) - - dataset = pretrain_dataloader.BertPretrainDataLoader(data_config).load() - features = next(iter(dataset)) - self.assertIn("input_word_ids", features) - self.assertIn("input_mask", features) - self.assertIn("input_type_ids", features) - self.assertIn("masked_lm_positions", features) - self.assertIn("masked_lm_ids", features) - self.assertIn("masked_lm_weights", features) - - -class XLNetPretrainDataTest(parameterized.TestCase, tf.test.TestCase): - - @parameterized.parameters(itertools.product( - ("single_token", "whole_word", "token_span"), - (0, 64), - (20, None), - )) - def test_load_data( - self, sample_strategy, reuse_length, max_predictions_per_seq): - train_data_path = os.path.join(self.get_temp_dir(), "train.tf_record") - seq_length = 128 - batch_size = 5 - - _create_fake_xlnet_dataset( - train_data_path, seq_length, max_predictions_per_seq) - - data_config = pretrain_dataloader.XLNetPretrainDataConfig( - input_path=train_data_path, - max_predictions_per_seq=max_predictions_per_seq, - seq_length=seq_length, - global_batch_size=batch_size, - is_training=True, - reuse_length=reuse_length, - sample_strategy=sample_strategy, - min_num_tokens=1, - max_num_tokens=2, - permutation_size=seq_length // 2, - leak_ratio=0.1) - - if max_predictions_per_seq is None: - with self.assertRaises(ValueError): - dataset = pretrain_dataloader.XLNetPretrainDataLoader( - data_config).load() - features = next(iter(dataset)) - else: - dataset = pretrain_dataloader.XLNetPretrainDataLoader(data_config).load() - features = next(iter(dataset)) - - self.assertIn("input_word_ids", features) - self.assertIn("input_type_ids", features) - self.assertIn("permutation_mask", features) - self.assertIn("masked_tokens", features) - self.assertIn("target", features) - self.assertIn("target_mask", features) - - self.assertAllClose(features["input_word_ids"].shape, - (batch_size, seq_length)) - self.assertAllClose(features["input_type_ids"].shape, - (batch_size, seq_length)) - 
self.assertAllClose(features["permutation_mask"].shape, - (batch_size, seq_length, seq_length)) - self.assertAllClose(features["masked_tokens"].shape, - (batch_size, seq_length,)) - if max_predictions_per_seq is not None: - self.assertIn("target_mapping", features) - self.assertAllClose(features["target_mapping"].shape, - (batch_size, max_predictions_per_seq, seq_length)) - self.assertAllClose(features["target_mask"].shape, - (batch_size, max_predictions_per_seq)) - self.assertAllClose(features["target"].shape, - (batch_size, max_predictions_per_seq)) - else: - self.assertAllClose(features["target_mask"].shape, - (batch_size, seq_length)) - self.assertAllClose(features["target"].shape, - (batch_size, seq_length)) - - -if __name__ == "__main__": - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/pretrain_dynamic_dataloader.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/pretrain_dynamic_dataloader.py deleted file mode 100644 index bde8e15b0e102e184c99d2681b40fe96799381d8..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/pretrain_dynamic_dataloader.py +++ /dev/null @@ -1,227 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Dataset loader for the pre-training with dynamic sequence length.""" -from typing import Optional, Tuple - -import dataclasses -import tensorflow as tf - -from official.core import config_definitions as cfg -from official.core import input_reader -from official.nlp.data import data_loader_factory -from official.nlp.data import pretrain_dataloader - - -@dataclasses.dataclass -class BertPretrainDataConfig(cfg.DataConfig): - """Data config for BERT pretraining task (tasks/masked_lm).""" - input_path: str = '' - global_batch_size: int = 512 - is_training: bool = True - seq_bucket_lengths: Tuple[int, ...] = (128, 256, 384, 512,) - # TODO(rxsang): `seq_bucket_window_scale` is only useful when round robin - # tf.data service is disabled. Deprecate this flag once we always enable round - # robin tf.data service. 
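-  # `seq_bucket_window_scale` multiplies the group_by_window window size in
-  # _bucketize_and_batch when tf.data service runs without round-robin reads.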
-  seq_bucket_window_scale: int = 8
-  use_next_sentence_label: bool = True
-  use_position_id: bool = False
-  deterministic: bool = False
-  enable_tf_data_service: bool = False
-  enable_round_robin_tf_data_service: bool = False
-  tf_data_service_job_name: str = 'bert_pretrain'
-  use_v2_feature_names: bool = False
-
-
-@data_loader_factory.register_data_loader_cls(BertPretrainDataConfig)
-class PretrainingDynamicDataLoader(pretrain_dataloader.BertPretrainDataLoader):
-  """Dataset loader for BERT-style pretraining with dynamic sequence length.
-
-  Bucketizes the input id features by `seq_bucket_lengths` and pads features
-  to the bucket boundaries. The mask features are usually shorter than the
-  input id features and can also be dynamic. We require that the mask feature
-  lengths within a bucket be the same. For example, with [128, 256] buckets,
-  the mask features for bucket 128 should always have the same length X and
-  the features for bucket 256 should always have the same length Y.
-
-  The dataloader does not filter out empty masks. Make sure to handle this
-  in the model.
-  """
-
-  def __init__(self, params):
-    self._params = params
-    if len(params.seq_bucket_lengths) < 1:
-      raise ValueError('The seq_bucket_lengths cannot be empty.')
-    self._seq_bucket_lengths = params.seq_bucket_lengths
-    self._seq_bucket_window_scale = params.seq_bucket_window_scale
-    self._global_batch_size = params.global_batch_size
-    self._use_next_sentence_label = params.use_next_sentence_label
-    self._use_position_id = params.use_position_id
-    self._drop_remainder = params.drop_remainder
-    self._enable_tf_data_service = params.enable_tf_data_service
-    self._enable_round_robin_tf_data_service = (
-        params.enable_round_robin_tf_data_service)
-    self._mask_keys = [
-        'masked_lm_positions', 'masked_lm_ids', 'masked_lm_weights'
-    ]
-
-  def _decode(self, record: tf.Tensor):
-    """Decodes a serialized tf.Example."""
-    name_to_features = {
-        'input_ids': tf.io.VarLenFeature(tf.int64),
-        'input_mask': tf.io.VarLenFeature(tf.int64),
-        'segment_ids': tf.io.VarLenFeature(tf.int64),
-        'masked_lm_positions': tf.io.VarLenFeature(tf.int64),
-        'masked_lm_ids': tf.io.VarLenFeature(tf.int64),
-        'masked_lm_weights': tf.io.VarLenFeature(tf.float32),
-    }
-    if self._use_next_sentence_label:
-      name_to_features['next_sentence_labels'] = tf.io.FixedLenFeature([1],
-                                                                       tf.int64)
-    dynamic_keys = ['input_ids', 'input_mask', 'segment_ids']
-    if self._use_position_id:
-      name_to_features['position_ids'] = tf.io.VarLenFeature(tf.int64)
-      dynamic_keys.append('position_ids')
-
-    example = tf.io.parse_single_example(record, name_to_features)
-    for key in dynamic_keys + self._mask_keys:
-      example[key] = tf.sparse.to_dense(example[key])
-
-    # Truncate the trailing padding after the last non-pad token along the
-    # sequence dimension; padding that appears before the last non-pad token
-    # is kept.
-    mask = tf.math.greater(
-        tf.math.cumsum(example['input_ids'], reverse=True), 0)
-    for key in dynamic_keys:
-      example[key] = tf.boolean_mask(example[key], mask)
-
-    # masked_lm_ids should be 0-padded. Change the mask features to -1 padding
-    # so that we can differentiate the padding added here from the padding
-    # added by bucketizing.
-    mask = tf.math.not_equal(example['masked_lm_ids'], 0)
-    example['masked_lm_ids'] = tf.where(
-        mask, example['masked_lm_ids'],
-        -tf.ones(
-            tf.shape(example['masked_lm_ids']),
-            dtype=example['masked_lm_ids'].dtype))
-
-    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
-    # So cast all int64 to int32.
-    # tf.data service uses the dataset graph fingerprint to distinguish input
-    # pipeline jobs, thus we sort the keys here to make sure they are generated
-    # in a deterministic order each time the dataset function is traced.
-    for name in sorted(list(example.keys())):
-      t = example[name]
-      if t.dtype == tf.int64:
-        t = tf.cast(t, tf.int32)
-      example[name] = t
-
-    return example
-
-  def _bucketize_and_batch(
-      self,
-      dataset,
-      input_context: Optional[tf.distribute.InputContext] = None):
-    """Bucketizes by sequence length and batches the datasets."""
-    per_replica_batch_size = input_context.get_per_replica_batch_size(
-        self._global_batch_size) if input_context else self._global_batch_size
-
-    def element_length_func(example, seq_len_dim):
-      return tf.shape(example['input_word_ids'])[seq_len_dim]
-
-    bucket_boundaries = [length + 1 for length in self._seq_bucket_lengths]
-    bucket_batch_sizes = [per_replica_batch_size] * (len(bucket_boundaries) + 1)
-
-    # Bucketize and batch the dataset with the per-replica batch size first.
-    dataset = dataset.apply(
-        tf.data.experimental.bucket_by_sequence_length(
-            lambda example: tf.cast(element_length_func(example, 0), tf.int32),
-            bucket_boundaries,
-            bucket_batch_sizes,
-            pad_to_bucket_boundary=True,
-            drop_remainder=self._drop_remainder))
-    if input_context:
-      window_size = input_context.num_replicas_in_sync
-      if self._enable_tf_data_service and (
-          not self._enable_round_robin_tf_data_service):
-        # If tf.data service is enabled but round-robin behavior is not,
-        # different TPU workers may fetch data from one tf.data service worker
-        # at different speeds. We make the window size
-        # `seq_bucket_window_scale` times larger to leave a buffer if some
-        # workers are fetching data faster than others, so all the data within
-        # the same global batch still has a better chance of landing in the
-        # same bucket.
-        window_size *= self._seq_bucket_window_scale
-
-      # Group `num_replicas_in_sync` batches from the same bucket together, so
-      # all replicas can get the same sequence length for one global step.
-      dataset = dataset.apply(
-          tf.data.experimental.group_by_window(
-              key_func=lambda example: tf.cast(  # pylint: disable=g-long-lambda
-                  element_length_func(example, 1), tf.int64),
-              reduce_func=lambda _, x: tf.data.Dataset.from_tensors(x),
-              window_size=window_size))
-      dataset = dataset.flat_map(lambda x: x)
-
-    def _remove_pads_from_bucketize(features):
-      # All mask features must have the same effective length.
-      # The real masked ids padding token is -1; the 0 padding comes from
-      # bucket_by_sequence_length.
-      mask = tf.math.not_equal(features['masked_lm_ids'], 0)
-
-      mask_per_example = tf.math.reduce_sum(tf.cast(mask, tf.int32), axis=1)
-      normalized = tf.cast(
-          mask_per_example / tf.math.reduce_max(mask_per_example), tf.int32)
-      assert_op = tf.debugging.assert_equal(
-          tf.math.reduce_sum(normalized), per_replica_batch_size,
-          'Number of non padded mask tokens is not the same for each example '
-          'in the same sequence length.')
-      with tf.control_dependencies([assert_op]):
-        for key in self._mask_keys:
-          features[key] = tf.reshape(
-              tf.boolean_mask(features[key], mask),
-              [per_replica_batch_size, -1])
-      # Revert masked_lm_ids to be 0-padded.
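-      # The -1 markers were introduced in _decode to tell real mask padding
-      # apart from the 0 padding added by bucketization; with the bucketizing
-      # pads removed, restore the original 0 padding for the model.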
-      mask = tf.math.not_equal(features['masked_lm_ids'], -1)
-      features['masked_lm_ids'] = tf.where(
-          mask, features['masked_lm_ids'],
-          tf.zeros(
-              tf.shape(features['masked_lm_ids']),
-              dtype=features['masked_lm_ids'].dtype))
-      return features
-
-    dataset = dataset.map(_remove_pads_from_bucketize)
-    return dataset
-
-  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
-    """Returns a tf.data.Dataset."""
-    reader = input_reader.InputReader(
-        params=self._params,
-        decoder_fn=self._decode,
-        parser_fn=self._parse,
-        transform_and_batch_fn=self._bucketize_and_batch)
-    return reader.read(input_context)
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/pretrain_dynamic_dataloader_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/pretrain_dynamic_dataloader_test.py
deleted file mode 100644
index 07ab60746b16f934fa29c2ef5434673047260149..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/pretrain_dynamic_dataloader_test.py
+++ /dev/null
@@ -1,258 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# - -"""Tests for nlp.data.pretrain_dynamic_dataloader.""" -import os - -from absl import logging -from absl.testing import parameterized -import numpy as np -import orbit -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations -from official.nlp.configs import bert -from official.nlp.configs import encoders -from official.nlp.data import pretrain_dataloader -from official.nlp.data import pretrain_dynamic_dataloader -from official.nlp.tasks import masked_lm - - -def _create_fake_dataset(output_path, seq_length, num_masked_tokens, - max_seq_length, num_examples): - """Creates a fake dataset.""" - writer = tf.io.TFRecordWriter(output_path) - - def create_int_feature(values): - f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) - return f - - def create_float_feature(values): - f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) - return f - - for _ in range(num_examples): - features = {} - padding = np.zeros(shape=(max_seq_length - seq_length), dtype=np.int32) - input_ids = np.random.randint(low=1, high=100, size=(seq_length)) - features['input_ids'] = create_int_feature( - np.concatenate((input_ids, padding))) - features['input_mask'] = create_int_feature( - np.concatenate((np.ones_like(input_ids), padding))) - features['segment_ids'] = create_int_feature( - np.concatenate((np.ones_like(input_ids), padding))) - features['position_ids'] = create_int_feature( - np.concatenate((np.ones_like(input_ids), padding))) - features['masked_lm_positions'] = create_int_feature( - np.random.randint(60, size=(num_masked_tokens), dtype=np.int64)) - features['masked_lm_ids'] = create_int_feature( - np.random.randint(100, size=(num_masked_tokens), dtype=np.int64)) - features['masked_lm_weights'] = create_float_feature( - np.ones((num_masked_tokens,), dtype=np.float32)) - features['next_sentence_labels'] = create_int_feature(np.array([0])) - - tf_example = tf.train.Example(features=tf.train.Features(feature=features)) - writer.write(tf_example.SerializeToString()) - writer.close() - - -class PretrainDynamicDataLoaderTest(tf.test.TestCase, parameterized.TestCase): - - @combinations.generate( - combinations.combine( - distribution_strategy=[ - strategy_combinations.cloud_tpu_strategy, - ], - mode='eager')) - def test_distribution_strategy(self, distribution_strategy): - max_seq_length = 128 - batch_size = 8 - input_path = os.path.join(self.get_temp_dir(), 'train.tf_record') - _create_fake_dataset( - input_path, - seq_length=60, - num_masked_tokens=20, - max_seq_length=max_seq_length, - num_examples=batch_size) - data_config = pretrain_dynamic_dataloader.BertPretrainDataConfig( - is_training=False, - input_path=input_path, - seq_bucket_lengths=[64, 128], - global_batch_size=batch_size) - dataloader = pretrain_dynamic_dataloader.PretrainingDynamicDataLoader( - data_config) - distributed_ds = orbit.utils.make_distributed_dataset( - distribution_strategy, dataloader.load) - train_iter = iter(distributed_ds) - with distribution_strategy.scope(): - config = masked_lm.MaskedLMConfig( - init_checkpoint=self.get_temp_dir(), - model=bert.PretrainerConfig( - encoders.EncoderConfig( - bert=encoders.BertEncoderConfig( - vocab_size=30522, num_layers=1)), - cls_heads=[ - bert.ClsHeadConfig( - inner_dim=10, num_classes=2, name='next_sentence') - ]), - train_data=data_config) - task = masked_lm.MaskedLMTask(config) - model = task.build_model() - metrics = task.build_metrics() - - @tf.function - def 
step_fn(features):
-        return task.validation_step(features, model, metrics=metrics)
-
-      distributed_outputs = distribution_strategy.run(
-          step_fn, args=(next(train_iter),))
-      local_results = tf.nest.map_structure(
-          distribution_strategy.experimental_local_results, distributed_outputs)
-      logging.info('Dynamic padding: local_results= %s', str(local_results))
-      dynamic_metrics = {}
-      for metric in metrics:
-        dynamic_metrics[metric.name] = metric.result()
-
-    data_config = pretrain_dataloader.BertPretrainDataConfig(
-        is_training=False,
-        input_path=input_path,
-        seq_length=max_seq_length,
-        max_predictions_per_seq=20,
-        global_batch_size=batch_size)
-    dataloader = pretrain_dataloader.BertPretrainDataLoader(data_config)
-    distributed_ds = orbit.utils.make_distributed_dataset(
-        distribution_strategy, dataloader.load)
-    train_iter = iter(distributed_ds)
-    with distribution_strategy.scope():
-      metrics = task.build_metrics()
-
-      @tf.function
-      def step_fn_b(features):
-        return task.validation_step(features, model, metrics=metrics)
-
-      distributed_outputs = distribution_strategy.run(
-          step_fn_b, args=(next(train_iter),))
-      local_results = tf.nest.map_structure(
-          distribution_strategy.experimental_local_results, distributed_outputs)
-      logging.info('Static padding: local_results= %s', str(local_results))
-      static_metrics = {}
-      for metric in metrics:
-        static_metrics[metric.name] = metric.result()
-    for key in static_metrics:
-      # We need to investigate the differences in the losses.
-      if key != 'next_sentence_loss':
-        self.assertEqual(dynamic_metrics[key], static_metrics[key])
-
-  def test_load_dataset(self):
-    max_seq_length = 128
-    batch_size = 2
-    input_path_1 = os.path.join(self.get_temp_dir(), 'train_1.tf_record')
-    _create_fake_dataset(
-        input_path_1,
-        seq_length=60,
-        num_masked_tokens=20,
-        max_seq_length=max_seq_length,
-        num_examples=batch_size)
-    input_path_2 = os.path.join(self.get_temp_dir(), 'train_2.tf_record')
-    _create_fake_dataset(
-        input_path_2,
-        seq_length=100,
-        num_masked_tokens=70,
-        max_seq_length=max_seq_length,
-        num_examples=batch_size)
-    input_paths = ','.join([input_path_1, input_path_2])
-    data_config = pretrain_dynamic_dataloader.BertPretrainDataConfig(
-        is_training=False,
-        input_path=input_paths,
-        seq_bucket_lengths=[64, 128],
-        use_position_id=True,
-        global_batch_size=batch_size)
-    dataset = pretrain_dynamic_dataloader.PretrainingDynamicDataLoader(
-        data_config).load()
-    dataset_it = iter(dataset)
-    features = next(dataset_it)
-    self.assertCountEqual([
-        'input_word_ids',
-        'input_mask',
-        'input_type_ids',
-        'next_sentence_labels',
-        'masked_lm_positions',
-        'masked_lm_ids',
-        'masked_lm_weights',
-        'position_ids',
-    ], features.keys())
-    # Sequence length dimension should be bucketized and padded to 64.
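-    # (For illustration: the fake examples have 60 real tokens, which fall
-    # into the first bucket; with bucket boundary 64 + 1 and
-    # pad_to_bucket_boundary=True they are padded to length 64, while the 20
-    # masked positions are kept as-is.)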
- self.assertEqual(features['input_word_ids'].shape, (batch_size, 64)) - self.assertEqual(features['input_mask'].shape, (batch_size, 64)) - self.assertEqual(features['input_type_ids'].shape, (batch_size, 64)) - self.assertEqual(features['position_ids'].shape, (batch_size, 64)) - self.assertEqual(features['masked_lm_positions'].shape, (batch_size, 20)) - features = next(dataset_it) - self.assertEqual(features['input_word_ids'].shape, (batch_size, 128)) - self.assertEqual(features['input_mask'].shape, (batch_size, 128)) - self.assertEqual(features['input_type_ids'].shape, (batch_size, 128)) - self.assertEqual(features['position_ids'].shape, (batch_size, 128)) - self.assertEqual(features['masked_lm_positions'].shape, (batch_size, 70)) - - def test_load_dataset_not_same_masks(self): - max_seq_length = 128 - batch_size = 2 - input_path_1 = os.path.join(self.get_temp_dir(), 'train_3.tf_record') - _create_fake_dataset( - input_path_1, - seq_length=60, - num_masked_tokens=20, - max_seq_length=max_seq_length, - num_examples=batch_size) - input_path_2 = os.path.join(self.get_temp_dir(), 'train_4.tf_record') - _create_fake_dataset( - input_path_2, - seq_length=60, - num_masked_tokens=15, - max_seq_length=max_seq_length, - num_examples=batch_size) - input_paths = ','.join([input_path_1, input_path_2]) - data_config = pretrain_dynamic_dataloader.BertPretrainDataConfig( - is_training=False, - input_path=input_paths, - seq_bucket_lengths=[64, 128], - use_position_id=True, - global_batch_size=batch_size * 2) - dataset = pretrain_dynamic_dataloader.PretrainingDynamicDataLoader( - data_config).load() - dataset_it = iter(dataset) - with self.assertRaisesRegex( - tf.errors.InvalidArgumentError, '.*Number of non padded mask tokens.*'): - next(dataset_it) - - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/question_answering_dataloader.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/question_answering_dataloader.py deleted file mode 100644 index 5798e2386707b0ef4a3fef47b36d45ec2ac49b39..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/question_answering_dataloader.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""Loads dataset for the question answering (e.g., SQuAD) task."""
-from typing import Mapping, Optional
-
-import dataclasses
-import tensorflow as tf
-from official.core import config_definitions as cfg
-from official.core import input_reader
-from official.nlp.data import data_loader
-from official.nlp.data import data_loader_factory
-
-
-@dataclasses.dataclass
-class QADataConfig(cfg.DataConfig):
-  """Data config for question answering task (tasks/question_answering)."""
-  # For training, `input_path` is expected to be a pre-processed TFRecord file,
-  # while for evaluation, it is expected to be a raw JSON file (b/173814590).
-  input_path: str = ''
-  global_batch_size: int = 48
-  is_training: bool = True
-  seq_length: int = 384
-  # Settings below are question answering specific.
-  version_2_with_negative: bool = False
-  # Settings below are only used for eval mode.
-  input_preprocessed_data_path: str = ''
-  doc_stride: int = 128
-  query_length: int = 64
-  # The path to the vocab file of the word piece tokenizer or the model
-  # file of the sentence piece tokenizer.
-  vocab_file: str = ''
-  tokenization: str = 'WordPiece'  # WordPiece or SentencePiece
-  do_lower_case: bool = True
-  xlnet_format: bool = False
-
-
-@data_loader_factory.register_data_loader_cls(QADataConfig)
-class QuestionAnsweringDataLoader(data_loader.DataLoader):
-  """A class to load dataset for the question answering task."""
-
-  def __init__(self, params):
-    self._params = params
-    self._seq_length = params.seq_length
-    self._is_training = params.is_training
-    self._xlnet_format = params.xlnet_format
-
-  def _decode(self, record: tf.Tensor):
-    """Decodes a serialized tf.Example."""
-    name_to_features = {
-        'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
-        'input_mask': tf.io.FixedLenFeature([self._seq_length], tf.int64),
-        'segment_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
-    }
-    if self._xlnet_format:
-      name_to_features['class_index'] = tf.io.FixedLenFeature([], tf.int64)
-      name_to_features['paragraph_mask'] = tf.io.FixedLenFeature(
-          [self._seq_length], tf.int64)
-      if self._is_training:
-        name_to_features['is_impossible'] = tf.io.FixedLenFeature([], tf.int64)
-
-    if self._is_training:
-      name_to_features['start_positions'] = tf.io.FixedLenFeature([], tf.int64)
-      name_to_features['end_positions'] = tf.io.FixedLenFeature([], tf.int64)
-    else:
-      name_to_features['unique_ids'] = tf.io.FixedLenFeature([], tf.int64)
-    example = tf.io.parse_single_example(record, name_to_features)
-
-    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
-    # So cast all int64 to int32.
-    for name in example:
-      t = example[name]
-      if t.dtype == tf.int64:
-        t = tf.cast(t, tf.int32)
-      example[name] = t
-
-    return example
-
-  def _parse(self, record: Mapping[str, tf.Tensor]):
-    """Parses raw tensors into a dict of tensors to be consumed by the model."""
-    x, y = {}, {}
-    for name, tensor in record.items():
-      if name in ('start_positions', 'end_positions', 'is_impossible'):
-        y[name] = tensor
-      elif name == 'input_ids':
-        x['input_word_ids'] = tensor
-      elif name == 'segment_ids':
-        x['input_type_ids'] = tensor
-      else:
-        x[name] = tensor
-      if name == 'start_positions' and self._xlnet_format:
-        x[name] = tensor
-    return (x, y)
-
-  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
-    """Returns a tf.data.Dataset."""
-    reader = input_reader.InputReader(
-        params=self._params, decoder_fn=self._decode, parser_fn=self._parse)
-    return reader.read(input_context)
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/question_answering_dataloader_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/question_answering_dataloader_test.py
deleted file mode 100644
index d2fed7107123ace020891225856750f47ec26cdd..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/question_answering_dataloader_test.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# - -"""Tests for official.nlp.data.question_answering_dataloader.""" -import os - -import numpy as np -import tensorflow as tf - -from official.nlp.data import question_answering_dataloader - - -def _create_fake_dataset(output_path, seq_length): - """Creates a fake dataset.""" - writer = tf.io.TFRecordWriter(output_path) - - def create_int_feature(values): - f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) - return f - - for _ in range(100): - features = {} - input_ids = np.random.randint(100, size=(seq_length)) - features['input_ids'] = create_int_feature(input_ids) - features['input_mask'] = create_int_feature(np.ones_like(input_ids)) - features['segment_ids'] = create_int_feature(np.ones_like(input_ids)) - features['start_positions'] = create_int_feature(np.array([0])) - features['end_positions'] = create_int_feature(np.array([10])) - - tf_example = tf.train.Example(features=tf.train.Features(feature=features)) - writer.write(tf_example.SerializeToString()) - writer.close() - - -class QuestionAnsweringDataTest(tf.test.TestCase): - - def test_load_dataset(self): - seq_length = 128 - batch_size = 10 - input_path = os.path.join(self.get_temp_dir(), 'train.tf_record') - _create_fake_dataset(input_path, seq_length) - data_config = question_answering_dataloader.QADataConfig( - is_training=True, - input_path=input_path, - seq_length=seq_length, - global_batch_size=batch_size) - dataset = question_answering_dataloader.QuestionAnsweringDataLoader( - data_config).load() - features, labels = next(iter(dataset)) - - self.assertCountEqual(['input_word_ids', 'input_mask', 'input_type_ids'], - features.keys()) - self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length)) - self.assertEqual(features['input_mask'].shape, (batch_size, seq_length)) - self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length)) - - self.assertCountEqual(['start_positions', 'end_positions'], labels.keys()) - self.assertEqual(labels['start_positions'].shape, (batch_size,)) - self.assertEqual(labels['end_positions'].shape, (batch_size,)) - - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/sentence_prediction_dataloader.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/sentence_prediction_dataloader.py deleted file mode 100644 index 3c01e79e4aee26fc1005fb195067d9ea5066512d..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/sentence_prediction_dataloader.py +++ /dev/null @@ -1,255 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""Loads dataset for the sentence prediction (classification) task."""
-import functools
-from typing import List, Mapping, Optional
-
-import dataclasses
-import tensorflow as tf
-import tensorflow_hub as hub
-
-from official.common import dataset_fn
-from official.core import config_definitions as cfg
-from official.core import input_reader
-from official.nlp import modeling
-from official.nlp.data import data_loader
-from official.nlp.data import data_loader_factory
-
-LABEL_TYPES_MAP = {'int': tf.int64, 'float': tf.float32}
-
-
-@dataclasses.dataclass
-class SentencePredictionDataConfig(cfg.DataConfig):
-  """Data config for sentence prediction task (tasks/sentence_prediction)."""
-  input_path: str = ''
-  global_batch_size: int = 32
-  is_training: bool = True
-  seq_length: int = 128
-  label_type: str = 'int'
-  # Whether to include the example id number.
-  include_example_id: bool = False
-
-
-@data_loader_factory.register_data_loader_cls(SentencePredictionDataConfig)
-class SentencePredictionDataLoader(data_loader.DataLoader):
-  """A class to load dataset for sentence prediction (classification) task."""
-
-  def __init__(self, params):
-    self._params = params
-    self._seq_length = params.seq_length
-    self._include_example_id = params.include_example_id
-
-  def _decode(self, record: tf.Tensor):
-    """Decodes a serialized tf.Example."""
-    label_type = LABEL_TYPES_MAP[self._params.label_type]
-    name_to_features = {
-        'input_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
-        'input_mask': tf.io.FixedLenFeature([self._seq_length], tf.int64),
-        'segment_ids': tf.io.FixedLenFeature([self._seq_length], tf.int64),
-        'label_ids': tf.io.FixedLenFeature([], label_type),
-    }
-    if self._include_example_id:
-      name_to_features['example_id'] = tf.io.FixedLenFeature([], tf.int64)
-
-    example = tf.io.parse_single_example(record, name_to_features)
-
-    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
-    # So cast all int64 to int32.
-    for name in example:
-      t = example[name]
-      if t.dtype == tf.int64:
-        t = tf.cast(t, tf.int32)
-      example[name] = t
-
-    return example
-
-  def _parse(self, record: Mapping[str, tf.Tensor]):
-    """Parses raw tensors into a dict of tensors to be consumed by the model."""
-    x = {
-        'input_word_ids': record['input_ids'],
-        'input_mask': record['input_mask'],
-        'input_type_ids': record['segment_ids']
-    }
-    if self._include_example_id:
-      x['example_id'] = record['example_id']
-
-    y = record['label_ids']
-    return (x, y)
-
-  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
-    """Returns a tf.data.Dataset."""
-    reader = input_reader.InputReader(
-        params=self._params, decoder_fn=self._decode, parser_fn=self._parse)
-    return reader.read(input_context)
-
-
-@dataclasses.dataclass
-class SentencePredictionTextDataConfig(cfg.DataConfig):
-  """Data config for sentence prediction task with raw text."""
-  # Either set `input_path`...
-  input_path: str = ''
-  # Either `int` or `float`.
-  label_type: str = 'int'
-  # ...or `tfds_name` and `tfds_split` to specify input.
- tfds_name: str = '' - tfds_split: str = '' - # The name of the text feature fields. The text features will be - # concatenated in order. - text_fields: Optional[List[str]] = None - label_field: str = 'label' - global_batch_size: int = 32 - seq_length: int = 128 - is_training: bool = True - # Either build preprocessing with Python code by specifying these values - # for modeling.layers.BertTokenizer()/SentencepieceTokenizer().... - tokenization: str = 'WordPiece' # WordPiece or SentencePiece - # Text vocab file if tokenization is WordPiece, or sentencepiece.ModelProto - # file if tokenization is SentencePiece. - vocab_file: str = '' - lower_case: bool = True - # ...or load preprocessing from a SavedModel at this location. - preprocessing_hub_module_url: str = '' - # Either tfrecord or sstsable or recordio. - file_type: str = 'tfrecord' - include_example_id: bool = False - - -class TextProcessor(tf.Module): - """Text features processing for sentence prediction task.""" - - def __init__(self, - seq_length: int, - vocab_file: Optional[str] = None, - tokenization: Optional[str] = None, - lower_case: Optional[bool] = True, - preprocessing_hub_module_url: Optional[str] = None): - if preprocessing_hub_module_url: - self._preprocessing_hub_module = hub.load(preprocessing_hub_module_url) - self._tokenizer = self._preprocessing_hub_module.tokenize - self._pack_inputs = functools.partial( - self._preprocessing_hub_module.bert_pack_inputs, - seq_length=seq_length) - return - - if tokenization == 'WordPiece': - self._tokenizer = modeling.layers.BertTokenizer( - vocab_file=vocab_file, lower_case=lower_case) - elif tokenization == 'SentencePiece': - self._tokenizer = modeling.layers.SentencepieceTokenizer( - model_file_path=vocab_file, lower_case=lower_case, - strip_diacritics=True) # Strip diacritics to follow ALBERT model - else: - raise ValueError('Unsupported tokenization: %s' % tokenization) - - self._pack_inputs = modeling.layers.BertPackInputs( - seq_length=seq_length, - special_tokens_dict=self._tokenizer.get_special_tokens_dict()) - - def __call__(self, segments): - segments = [self._tokenizer(s) for s in segments] - # BertTokenizer returns a RaggedTensor with shape [batch, word, subword], - # and SentencepieceTokenizer returns a RaggedTensor with shape - # [batch, sentencepiece], - segments = [ - tf.cast(x.merge_dims(1, -1) if x.shape.rank > 2 else x, tf.int32) - for x in segments - ] - return self._pack_inputs(segments) - - -@data_loader_factory.register_data_loader_cls(SentencePredictionTextDataConfig) -class SentencePredictionTextDataLoader(data_loader.DataLoader): - """Loads dataset with raw text for sentence prediction task.""" - - def __init__(self, params): - if bool(params.tfds_name) != bool(params.tfds_split): - raise ValueError('`tfds_name` and `tfds_split` should be specified or ' - 'unspecified at the same time.') - if bool(params.tfds_name) == bool(params.input_path): - raise ValueError('Must specify either `tfds_name` and `tfds_split` ' - 'or `input_path`.') - if not params.text_fields: - raise ValueError('Unexpected empty text fields.') - if bool(params.vocab_file) == bool(params.preprocessing_hub_module_url): - raise ValueError('Must specify exactly one of vocab_file (with matching ' - 'lower_case flag) or preprocessing_hub_module_url.') - - self._params = params - self._text_fields = params.text_fields - self._label_field = params.label_field - self._label_type = params.label_type - self._include_example_id = params.include_example_id - self._text_processor = TextProcessor( - 
seq_length=params.seq_length,
-        vocab_file=params.vocab_file,
-        tokenization=params.tokenization,
-        lower_case=params.lower_case,
-        preprocessing_hub_module_url=params.preprocessing_hub_module_url)
-
-  def _bert_preprocess(self, record: Mapping[str, tf.Tensor]):
-    """Runs BERT preprocessing on the raw text fields."""
-    segments = [record[x] for x in self._text_fields]
-    model_inputs = self._text_processor(segments)
-    if self._include_example_id:
-      model_inputs['example_id'] = record['example_id']
-    y = record[self._label_field]
-    return model_inputs, y
-
-  def _decode(self, record: tf.Tensor):
-    """Decodes a serialized tf.Example."""
-    name_to_features = {}
-    for text_field in self._text_fields:
-      name_to_features[text_field] = tf.io.FixedLenFeature([], tf.string)
-
-    label_type = LABEL_TYPES_MAP[self._label_type]
-    name_to_features[self._label_field] = tf.io.FixedLenFeature([], label_type)
-    if self._include_example_id:
-      name_to_features['example_id'] = tf.io.FixedLenFeature([], tf.int64)
-    example = tf.io.parse_single_example(record, name_to_features)
-
-    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
-    # So cast all int64 to int32.
-    for name in example:
-      t = example[name]
-      if t.dtype == tf.int64:
-        t = tf.cast(t, tf.int32)
-      example[name] = t
-
-    return example
-
-  def load(self, input_context: Optional[tf.distribute.InputContext] = None):
-    """Returns a tf.data.Dataset."""
-    reader = input_reader.InputReader(
-        dataset_fn=dataset_fn.pick_dataset_fn(self._params.file_type),
-        decoder_fn=self._decode if self._params.input_path else None,
-        params=self._params,
-        postprocess_fn=self._bert_preprocess)
-    return reader.read(input_context)
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/sentence_prediction_dataloader_test.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/sentence_prediction_dataloader_test.py
deleted file mode 100644
index e70261e6af96fc8e7987fca09a1cfd8dadaa974c..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/sentence_prediction_dataloader_test.py
+++ /dev/null
@@ -1,265 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""Tests for official.nlp.data.sentence_prediction_dataloader."""
-import os
-
-from absl.testing import parameterized
-import numpy as np
-import tensorflow as tf
-
-from sentencepiece import SentencePieceTrainer
-from official.nlp.data import sentence_prediction_dataloader as loader
-
-
-def _create_fake_preprocessed_dataset(output_path, seq_length, label_type):
-  """Creates a fake dataset."""
-  writer = tf.io.TFRecordWriter(output_path)
-
-  def create_int_feature(values):
-    f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
-    return f
-
-  def create_float_feature(values):
-    f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
-    return f
-
-  for _ in range(100):
-    features = {}
-    input_ids = np.random.randint(100, size=(seq_length))
-    features['input_ids'] = create_int_feature(input_ids)
-    features['input_mask'] = create_int_feature(np.ones_like(input_ids))
-    features['segment_ids'] = create_int_feature(np.ones_like(input_ids))
-
-    if label_type == 'int':
-      features['label_ids'] = create_int_feature([1])
-    elif label_type == 'float':
-      features['label_ids'] = create_float_feature([0.5])
-    else:
-      raise ValueError('Unsupported label_type: %s' % label_type)
-
-    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
-    writer.write(tf_example.SerializeToString())
-  writer.close()
-
-
-def _create_fake_raw_dataset(output_path, text_fields, label_type):
-  """Creates a fake tf record file."""
-  writer = tf.io.TFRecordWriter(output_path)
-
-  def create_str_feature(value):
-    f = tf.train.Feature(bytes_list=tf.train.BytesList(value=value))
-    return f
-
-  def create_int_feature(values):
-    f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
-    return f
-
-  def create_float_feature(values):
-    f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
-    return f
-
-  for _ in range(100):
-    features = {}
-    for text_field in text_fields:
-      features[text_field] = create_str_feature([b'hello world'])
-
-    if label_type == 'int':
-      features['label'] = create_int_feature([0])
-    elif label_type == 'float':
-      features['label'] = create_float_feature([0.5])
-    else:
-      raise ValueError('Unexpected label_type: %s' % label_type)
-    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
-    writer.write(tf_example.SerializeToString())
-  writer.close()
-
-
-def _create_fake_sentencepiece_model(output_dir):
-  vocab = ['a', 'b', 'c', 'd', 'e', 'abc', 'def', 'ABC', 'DEF']
-  model_prefix = os.path.join(output_dir, 'spm_model')
-  input_text_file_path = os.path.join(output_dir, 'train_input.txt')
-  with tf.io.gfile.GFile(input_text_file_path, 'w') as f:
-    f.write(' '.join(vocab + ['\n']))
-  # Add 7 more tokens: <pad>, <unk>, [CLS], [SEP], [MASK], <s>, </s>.
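-  # (For the count: pad_id=0 and unk_id=1 come from the flags below,
-  # [CLS]/[SEP]/[MASK] are the control_symbols, and <s>/</s> take the last
-  # two ids via bos_id/eos_id.)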
- full_vocab_size = len(vocab) + 7 - flags = dict( - model_prefix=model_prefix, - model_type='word', - input=input_text_file_path, - pad_id=0, - unk_id=1, - control_symbols='[CLS],[SEP],[MASK]', - vocab_size=full_vocab_size, - bos_id=full_vocab_size - 2, - eos_id=full_vocab_size - 1) - SentencePieceTrainer.Train(' '.join( - ['--{}={}'.format(k, v) for k, v in flags.items()])) - return model_prefix + '.model' - - -def _create_fake_vocab_file(vocab_file_path): - tokens = ['[PAD]'] - for i in range(1, 100): - tokens.append('[unused%d]' % i) - tokens.extend(['[UNK]', '[CLS]', '[SEP]', '[MASK]', 'hello', 'world']) - with tf.io.gfile.GFile(vocab_file_path, 'w') as outfile: - outfile.write('\n'.join(tokens)) - - -class SentencePredictionDataTest(tf.test.TestCase, parameterized.TestCase): - - @parameterized.parameters(('int', tf.int32), ('float', tf.float32)) - def test_load_dataset(self, label_type, expected_label_type): - input_path = os.path.join(self.get_temp_dir(), 'train.tf_record') - batch_size = 10 - seq_length = 128 - _create_fake_preprocessed_dataset(input_path, seq_length, label_type) - data_config = loader.SentencePredictionDataConfig( - input_path=input_path, - seq_length=seq_length, - global_batch_size=batch_size, - label_type=label_type) - dataset = loader.SentencePredictionDataLoader(data_config).load() - features, labels = next(iter(dataset)) - self.assertCountEqual(['input_word_ids', 'input_mask', 'input_type_ids'], - features.keys()) - self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length)) - self.assertEqual(features['input_mask'].shape, (batch_size, seq_length)) - self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length)) - self.assertEqual(labels.shape, (batch_size,)) - self.assertEqual(labels.dtype, expected_label_type) - - -class SentencePredictionTfdsDataLoaderTest(tf.test.TestCase, - parameterized.TestCase): - - @parameterized.parameters(True, False) - def test_python_wordpiece_preprocessing(self, use_tfds): - batch_size = 10 - seq_length = 256 # Non-default value. - lower_case = True - - tf_record_path = os.path.join(self.get_temp_dir(), 'train.tf_record') - text_fields = ['sentence1', 'sentence2'] - if not use_tfds: - _create_fake_raw_dataset(tf_record_path, text_fields, label_type='int') - - vocab_file_path = os.path.join(self.get_temp_dir(), 'vocab.txt') - _create_fake_vocab_file(vocab_file_path) - - data_config = loader.SentencePredictionTextDataConfig( - input_path='' if use_tfds else tf_record_path, - tfds_name='glue/mrpc' if use_tfds else '', - tfds_split='train' if use_tfds else '', - text_fields=text_fields, - global_batch_size=batch_size, - seq_length=seq_length, - is_training=True, - lower_case=lower_case, - vocab_file=vocab_file_path) - dataset = loader.SentencePredictionTextDataLoader(data_config).load() - features, labels = next(iter(dataset)) - self.assertCountEqual(['input_word_ids', 'input_type_ids', 'input_mask'], - features.keys()) - self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length)) - self.assertEqual(features['input_mask'].shape, (batch_size, seq_length)) - self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length)) - self.assertEqual(labels.shape, (batch_size,)) - - @parameterized.parameters(True, False) - def test_python_sentencepiece_preprocessing(self, use_tfds): - batch_size = 10 - seq_length = 256 # Non-default value. 
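-    # (A non-default value is used so the test would catch the loader
-    # silently falling back to the default seq_length of 128.)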
- lower_case = True - - tf_record_path = os.path.join(self.get_temp_dir(), 'train.tf_record') - text_fields = ['sentence1', 'sentence2'] - if not use_tfds: - _create_fake_raw_dataset(tf_record_path, text_fields, label_type='int') - - sp_model_file_path = _create_fake_sentencepiece_model(self.get_temp_dir()) - data_config = loader.SentencePredictionTextDataConfig( - input_path='' if use_tfds else tf_record_path, - tfds_name='glue/mrpc' if use_tfds else '', - tfds_split='train' if use_tfds else '', - text_fields=text_fields, - global_batch_size=batch_size, - seq_length=seq_length, - is_training=True, - lower_case=lower_case, - tokenization='SentencePiece', - vocab_file=sp_model_file_path, - ) - dataset = loader.SentencePredictionTextDataLoader(data_config).load() - features, labels = next(iter(dataset)) - self.assertCountEqual(['input_word_ids', 'input_type_ids', 'input_mask'], - features.keys()) - self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length)) - self.assertEqual(features['input_mask'].shape, (batch_size, seq_length)) - self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length)) - self.assertEqual(labels.shape, (batch_size,)) - - @parameterized.parameters(True, False) - def test_saved_model_preprocessing(self, use_tfds): - batch_size = 10 - seq_length = 256 # Non-default value. - - tf_record_path = os.path.join(self.get_temp_dir(), 'train.tf_record') - text_fields = ['sentence1', 'sentence2'] - if not use_tfds: - _create_fake_raw_dataset(tf_record_path, text_fields, label_type='float') - - vocab_file_path = os.path.join(self.get_temp_dir(), 'vocab.txt') - _create_fake_vocab_file(vocab_file_path) - data_config = loader.SentencePredictionTextDataConfig( - input_path='' if use_tfds else tf_record_path, - tfds_name='glue/mrpc' if use_tfds else '', - tfds_split='train' if use_tfds else '', - text_fields=text_fields, - global_batch_size=batch_size, - seq_length=seq_length, - is_training=True, - preprocessing_hub_module_url=( - 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'), - label_type='int' if use_tfds else 'float', - ) - dataset = loader.SentencePredictionTextDataLoader(data_config).load() - features, labels = next(iter(dataset)) - self.assertCountEqual(['input_word_ids', 'input_type_ids', 'input_mask'], - features.keys()) - self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length)) - self.assertEqual(features['input_mask'].shape, (batch_size, seq_length)) - self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length)) - self.assertEqual(labels.shape, (batch_size,)) - - -if __name__ == '__main__': - tf.test.main() diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/sentence_retrieval_lib.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/sentence_retrieval_lib.py deleted file mode 100644 index 9971d6e2ae6f3d13192efb15b1e5b289cba68293..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/sentence_retrieval_lib.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ============================================================================
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-"""BERT library to process data for cross-lingual sentence retrieval task."""
-
-import os
-
-from absl import logging
-from official.nlp.bert import tokenization
-from official.nlp.data import classifier_data_lib
-
-
-class BuccProcessor(classifier_data_lib.DataProcessor):
-  """Processor for the Xtreme BUCC data set."""
-  supported_languages = ["de", "fr", "ru", "zh"]
-
-  def __init__(self, process_text_fn=tokenization.convert_to_unicode):
-    super(BuccProcessor, self).__init__(process_text_fn)
-    self.languages = BuccProcessor.supported_languages
-
-  def get_dev_examples(self, data_dir, file_pattern):
-    return self._create_examples(
-        self._read_tsv(os.path.join(data_dir, file_pattern.format("dev"))),
-        "sample")
-
-  def get_test_examples(self, data_dir, file_pattern):
-    return self._create_examples(
-        self._read_tsv(os.path.join(data_dir, file_pattern.format("test"))),
-        "test")
-
-  @staticmethod
-  def get_processor_name():
-    """See base class."""
-    return "BUCC"
-
-  def _create_examples(self, lines, set_type):
-    """Creates examples for the dev and test sets."""
-    examples = []
-    for (i, line) in enumerate(lines):
-      guid = "%s-%s" % (set_type, i)
-      example_id = int(line[0].split("-")[1])
-      text_a = self.process_text_fn(line[1])
-      examples.append(
-          classifier_data_lib.InputExample(
-              guid=guid, text_a=text_a, example_id=example_id))
-    return examples
-
-
-class TatoebaProcessor(classifier_data_lib.DataProcessor):
-  """Processor for the Xtreme Tatoeba data set."""
-  supported_languages = [
-      "af", "ar", "bg", "bn", "de", "el", "es", "et", "eu", "fa", "fi", "fr",
-      "he", "hi", "hu", "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr",
-      "nl", "pt", "ru", "sw", "ta", "te", "th", "tl", "tr", "ur", "vi", "zh"
-  ]
-
-  def __init__(self, process_text_fn=tokenization.convert_to_unicode):
-    super(TatoebaProcessor, self).__init__(process_text_fn)
-    self.languages = TatoebaProcessor.supported_languages
-
-  def get_test_examples(self, data_dir, file_path):
-    return self._create_examples(
-        self._read_tsv(os.path.join(data_dir, file_path)), "test")
-
-  @staticmethod
-  def get_processor_name():
-    """See base class."""
-    return "TATOEBA"
-
-  def _create_examples(self, lines, set_type):
-    """Creates examples for the test set."""
-    examples = []
-    for (i, line) in enumerate(lines):
-      guid = "%s-%s" % (set_type, i)
-      text_a = self.process_text_fn(line[0])
-      examples.append(
-          classifier_data_lib.InputExample(
-              guid=guid, text_a=text_a, example_id=i))
-    return examples
-
-
-def generate_sentence_retrevial_tf_record(processor,
-                                          data_dir,
-                                          tokenizer,
-                                          eval_data_output_path=None,
-                                          test_data_output_path=None,
-                                          max_seq_length=128):
-  """Generates the tf records for retrieval tasks.
-
-  Args:
-    processor: Input processor object to be used for generating data. Subclass
-      of `DataProcessor`.
-    data_dir: Directory that contains train/eval data to process. Data files
-      should follow the naming pattern constructed below.
-    tokenizer: The tokenizer to be applied on the data.
-    eval_data_output_path: Output to which processed tf record for evaluation
-      will be saved.
-    test_data_output_path: Output to which processed tf record for testing
-      will be saved. Must be a pattern template with {} if processor has
-      language specific test data.
-    max_seq_length: Maximum sequence length of the training/eval data to be
-      generated.
-
-  Returns:
-    A dictionary containing input meta data.
-  """
-  assert eval_data_output_path or test_data_output_path
-
-  if processor.get_processor_name() == "BUCC":
-    path_pattern = "{}-en.{{}}.{}"
-
-  if processor.get_processor_name() == "TATOEBA":
-    path_pattern = "{}-en.{}"
-
-  meta_data = {
-      "processor_type": processor.get_processor_name(),
-      "max_seq_length": max_seq_length,
-      "number_eval_data": {},
-      "number_test_data": {},
-  }
-  logging.info("Start to process %s task data", processor.get_processor_name())
-
-  for lang_a in processor.languages:
-    for lang_b in [lang_a, "en"]:
-      if eval_data_output_path:
-        eval_input_data_examples = processor.get_dev_examples(
-            data_dir, os.path.join(path_pattern.format(lang_a, lang_b)))
-
-        num_eval_data = len(eval_input_data_examples)
-        logging.info("Processing %d dev examples of %s-en.%s", num_eval_data,
-                     lang_a, lang_b)
-        output_file = os.path.join(
-            eval_data_output_path,
-            "{}-en-{}.{}.tfrecords".format(lang_a, lang_b, "dev"))
-        classifier_data_lib.file_based_convert_examples_to_features(
-            eval_input_data_examples, None, max_seq_length, tokenizer,
-            output_file, None)
-        meta_data["number_eval_data"][f"{lang_a}-en.{lang_b}"] = num_eval_data
-
-      if test_data_output_path:
-        test_input_data_examples = processor.get_test_examples(
-            data_dir, os.path.join(path_pattern.format(lang_a, lang_b)))
-
-        num_test_data = len(test_input_data_examples)
-        logging.info("Processing %d test examples of %s-en.%s", num_test_data,
-                     lang_a, lang_b)
-        output_file = os.path.join(
-            test_data_output_path,
-            "{}-en-{}.{}.tfrecords".format(lang_a, lang_b, "test"))
-        classifier_data_lib.file_based_convert_examples_to_features(
-            test_input_data_examples, None, max_seq_length, tokenizer,
-            output_file, None)
-        meta_data["number_test_data"][f"{lang_a}-en.{lang_b}"] = num_test_data
-
-  return meta_data
diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/squad_lib.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/squad_lib.py
deleted file mode 100644
index 407aeaac4f4df3711b1283b76c79cccb99d27da9..0000000000000000000000000000000000000000
--- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/squad_lib.py
+++ /dev/null
@@ -1,991 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Library to process data for SQuAD 1.1 and SQuAD 2.0.""" -# pylint: disable=g-bad-import-order -import collections -import copy -import json -import math -import os - -import six - -from absl import logging -import tensorflow as tf - -from official.nlp.bert import tokenization - - -class SquadExample(object): - """A single training/test example for simple sequence classification. - - For examples without an answer, the start and end position are -1. - - Attributes: - qas_id: ID of the question-answer pair. - question_text: Original text for the question. - doc_tokens: The list of tokens in the context obtained by splitting on - whitespace only. - orig_answer_text: Original text for the answer. - start_position: Starting index of the answer in `doc_tokens`. - end_position: Ending index of the answer in `doc_tokens`. - is_impossible: Whether the question is impossible to answer given the - context. Only used in SQuAD 2.0. 
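-
-  Example (illustrative values only, not from the original file):
-    For the context "BERT is a language model" and the answer "language
-    model", doc_tokens is ["BERT", "is", "a", "language", "model"],
-    start_position is 3 and end_position is 4.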
- """ - - def __init__(self, - qas_id, - question_text, - doc_tokens, - orig_answer_text=None, - start_position=None, - end_position=None, - is_impossible=False): - self.qas_id = qas_id - self.question_text = question_text - self.doc_tokens = doc_tokens - self.orig_answer_text = orig_answer_text - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - def __str__(self): - return self.__repr__() - - def __repr__(self): - s = "" - s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) - s += ", question_text: %s" % ( - tokenization.printable_text(self.question_text)) - s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) - if self.start_position: - s += ", start_position: %d" % (self.start_position) - if self.start_position: - s += ", end_position: %d" % (self.end_position) - if self.start_position: - s += ", is_impossible: %r" % (self.is_impossible) - return s - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, - unique_id, - example_index, - doc_span_index, - tokens, - token_to_orig_map, - token_is_max_context, - input_ids, - input_mask, - segment_ids, - paragraph_mask=None, - class_index=None, - start_position=None, - end_position=None, - is_impossible=None): - self.unique_id = unique_id - self.example_index = example_index - self.doc_span_index = doc_span_index - self.tokens = tokens - self.token_to_orig_map = token_to_orig_map - self.token_is_max_context = token_is_max_context - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - self.paragraph_mask = paragraph_mask - self.class_index = class_index - - -class FeatureWriter(object): - """Writes InputFeature to TF example file.""" - - def __init__(self, filename, is_training): - self.filename = filename - self.is_training = is_training - self.num_features = 0 - tf.io.gfile.makedirs(os.path.dirname(filename)) - self._writer = tf.io.TFRecordWriter(filename) - - def process_feature(self, feature): - """Write a InputFeature to the TFRecordWriter as a tf.train.Example.""" - self.num_features += 1 - - def create_int_feature(values): - feature = tf.train.Feature( - int64_list=tf.train.Int64List(value=list(values))) - return feature - - features = collections.OrderedDict() - features["unique_ids"] = create_int_feature([feature.unique_id]) - features["input_ids"] = create_int_feature(feature.input_ids) - features["input_mask"] = create_int_feature(feature.input_mask) - features["segment_ids"] = create_int_feature(feature.segment_ids) - - if feature.paragraph_mask is not None: - features["paragraph_mask"] = create_int_feature(feature.paragraph_mask) - if feature.class_index is not None: - features["class_index"] = create_int_feature([feature.class_index]) - - if self.is_training: - features["start_positions"] = create_int_feature([feature.start_position]) - features["end_positions"] = create_int_feature([feature.end_position]) - impossible = 0 - if feature.is_impossible: - impossible = 1 - features["is_impossible"] = create_int_feature([impossible]) - - tf_example = tf.train.Example(features=tf.train.Features(feature=features)) - self._writer.write(tf_example.SerializeToString()) - - def close(self): - self._writer.close() - - -def read_squad_examples(input_file, is_training, - version_2_with_negative, - translated_input_folder=None): - """Read a SQuAD json file into a list of 
SquadExample.""" - with tf.io.gfile.GFile(input_file, "r") as reader: - input_data = json.load(reader)["data"] - - if translated_input_folder is not None: - translated_files = tf.io.gfile.glob( - os.path.join(translated_input_folder, "*.json")) - for file in translated_files: - with tf.io.gfile.GFile(file, "r") as reader: - input_data.extend(json.load(reader)["data"]) - - def is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False - - examples = [] - for entry in input_data: - for paragraph in entry["paragraphs"]: - paragraph_text = paragraph["context"] - doc_tokens = [] - char_to_word_offset = [] - prev_is_whitespace = True - for c in paragraph_text: - if is_whitespace(c): - prev_is_whitespace = True - else: - if prev_is_whitespace: - doc_tokens.append(c) - else: - doc_tokens[-1] += c - prev_is_whitespace = False - char_to_word_offset.append(len(doc_tokens) - 1) - - for qa in paragraph["qas"]: - qas_id = qa["id"] - question_text = qa["question"] - start_position = None - end_position = None - orig_answer_text = None - is_impossible = False - if is_training: - - if version_2_with_negative: - is_impossible = qa["is_impossible"] - if (len(qa["answers"]) != 1) and (not is_impossible): - raise ValueError( - "For training, each question should have exactly 1 answer.") - if not is_impossible: - answer = qa["answers"][0] - orig_answer_text = answer["text"] - answer_offset = answer["answer_start"] - answer_length = len(orig_answer_text) - start_position = char_to_word_offset[answer_offset] - end_position = char_to_word_offset[answer_offset + answer_length - - 1] - # Only add answers where the text can be exactly recovered from the - # document. If this CAN'T happen it's likely due to weird Unicode - # stuff so we will just skip the example. - # - # Note that this means for training mode, every example is NOT - # guaranteed to be preserved. - actual_text = " ".join(doc_tokens[start_position:(end_position + - 1)]) - cleaned_answer_text = " ".join( - tokenization.whitespace_tokenize(orig_answer_text)) - if actual_text.find(cleaned_answer_text) == -1: - logging.warning("Could not find answer: '%s' vs. 
'%s'", - actual_text, cleaned_answer_text) - continue - else: - start_position = -1 - end_position = -1 - orig_answer_text = "" - - example = SquadExample( - qas_id=qas_id, - question_text=question_text, - doc_tokens=doc_tokens, - orig_answer_text=orig_answer_text, - start_position=start_position, - end_position=end_position, - is_impossible=is_impossible) - examples.append(example) - - return examples - - -def convert_examples_to_features(examples, - tokenizer, - max_seq_length, - doc_stride, - max_query_length, - is_training, - output_fn, - xlnet_format=False, - batch_size=None): - """Loads a data file into a list of `InputBatch`s.""" - - base_id = 1000000000 - unique_id = base_id - feature = None - for (example_index, example) in enumerate(examples): - query_tokens = tokenizer.tokenize(example.question_text) - - if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] - - tok_to_orig_index = [] - orig_to_tok_index = [] - all_doc_tokens = [] - for (i, token) in enumerate(example.doc_tokens): - orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - - tok_start_position = None - tok_end_position = None - if is_training and example.is_impossible: - tok_start_position = -1 - tok_end_position = -1 - if is_training and not example.is_impossible: - tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(example.doc_tokens) - 1: - tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 - else: - tok_end_position = len(all_doc_tokens) - 1 - (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, - example.orig_answer_text) - - # The -3 accounts for [CLS], [SEP] and [SEP] - max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - - # We can have documents that are longer than the maximum sequence length. - # To deal with this we do a sliding window approach, where we take chunks - # of the up to our max length with a stride of `doc_stride`. - _DocSpan = collections.namedtuple( # pylint: disable=invalid-name - "DocSpan", ["start", "length"]) - doc_spans = [] - start_offset = 0 - while start_offset < len(all_doc_tokens): - length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc - doc_spans.append(_DocSpan(start=start_offset, length=length)) - if start_offset + length == len(all_doc_tokens): - break - start_offset += min(length, doc_stride) - - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] - token_to_orig_map = {} - token_is_max_context = {} - segment_ids = [] - - # Paragraph mask used in XLNet. - # 1 represents paragraph and class tokens. - # 0 represents query and other special tokens. 
- paragraph_mask = [] - - # pylint: disable=cell-var-from-loop - def process_query(seg_q): - for token in query_tokens: - tokens.append(token) - segment_ids.append(seg_q) - paragraph_mask.append(0) - tokens.append("[SEP]") - segment_ids.append(seg_q) - paragraph_mask.append(0) - - def process_paragraph(seg_p): - for i in range(doc_span.length): - split_token_index = doc_span.start + i - token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - - is_max_context = _check_is_max_context(doc_spans, doc_span_index, - split_token_index) - token_is_max_context[len(tokens)] = is_max_context - tokens.append(all_doc_tokens[split_token_index]) - segment_ids.append(seg_p) - paragraph_mask.append(1) - tokens.append("[SEP]") - segment_ids.append(seg_p) - paragraph_mask.append(0) - - def process_class(seg_class): - class_index = len(segment_ids) - tokens.append("[CLS]") - segment_ids.append(seg_class) - paragraph_mask.append(1) - return class_index - - if xlnet_format: - seg_p, seg_q, seg_class, seg_pad = 0, 1, 2, 3 - process_paragraph(seg_p) - process_query(seg_q) - class_index = process_class(seg_class) - else: - seg_p, seg_q, seg_class, seg_pad = 1, 0, 0, 0 - class_index = process_class(seg_class) - process_query(seg_q) - process_paragraph(seg_p) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. - while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(seg_pad) - paragraph_mask.append(0) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - assert len(paragraph_mask) == max_seq_length - - start_position = 0 - end_position = 0 - span_contains_answer = False - - if is_training and not example.is_impossible: - # For training, if our document chunk does not contain an annotation - # we throw it out, since there is nothing to predict. 
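-        # (The doc_offset below, len(query_tokens) + 2, accounts for the
-        # leading [CLS] and the [SEP] that follows the query in the default
-        # BERT layout; XLNet puts the paragraph first, so its offset is 0.)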
-        doc_start = doc_span.start
-        doc_end = doc_span.start + doc_span.length - 1
-        span_contains_answer = (tok_start_position >= doc_start and
-                                tok_end_position <= doc_end)
-        if span_contains_answer:
-          doc_offset = 0 if xlnet_format else len(query_tokens) + 2
-          start_position = tok_start_position - doc_start + doc_offset
-          end_position = tok_end_position - doc_start + doc_offset
-
-      if example_index < 20:
-        logging.info("*** Example ***")
-        logging.info("unique_id: %s", (unique_id))
-        logging.info("example_index: %s", (example_index))
-        logging.info("doc_span_index: %s", (doc_span_index))
-        logging.info("tokens: %s",
-                     " ".join([tokenization.printable_text(x) for x in tokens]))
-        logging.info(
-            "token_to_orig_map: %s", " ".join([
-                "%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)
-            ]))
-        logging.info(
-            "token_is_max_context: %s", " ".join([
-                "%d:%s" % (x, y)
-                for (x, y) in six.iteritems(token_is_max_context)
-            ]))
-        logging.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
-        logging.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
-        logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
-        logging.info("paragraph_mask: %s", " ".join(
-            [str(x) for x in paragraph_mask]))
-        logging.info("class_index: %d", class_index)
-        if is_training:
-          if span_contains_answer:
-            answer_text = " ".join(tokens[start_position:(end_position + 1)])
-            logging.info("start_position: %d", (start_position))
-            logging.info("end_position: %d", (end_position))
-            logging.info("answer: %s", tokenization.printable_text(answer_text))
-          else:
-            logging.info("document span doesn't contain answer")
-
-      feature = InputFeatures(
-          unique_id=unique_id,
-          example_index=example_index,
-          doc_span_index=doc_span_index,
-          tokens=tokens,
-          paragraph_mask=paragraph_mask,
-          class_index=class_index,
-          token_to_orig_map=token_to_orig_map,
-          token_is_max_context=token_is_max_context,
-          input_ids=input_ids,
-          input_mask=input_mask,
-          segment_ids=segment_ids,
-          start_position=start_position,
-          end_position=end_position,
-          is_impossible=not span_contains_answer)
-
-      # Run callback
-      if is_training:
-        output_fn(feature)
-      else:
-        output_fn(feature, is_padding=False)
-
-      unique_id += 1
-
-  if not is_training and feature:
-    assert batch_size
-    num_padding = 0
-    num_examples = unique_id - base_id
-    if unique_id % batch_size != 0:
-      num_padding = batch_size - (num_examples % batch_size)
-    logging.info("Adding padding examples to make sure no partial batch.")
-    logging.info("Adds %d padding examples for inference.", num_padding)
-    dummy_feature = copy.deepcopy(feature)
-    for _ in range(num_padding):
-      dummy_feature.unique_id = unique_id
-
-      # Run callback
-      output_fn(dummy_feature, is_padding=True)
-      unique_id += 1
-  return unique_id - base_id
-
-
-def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
-                         orig_answer_text):
-  """Returns tokenized answer spans that better match the annotated answer."""
-
-  # The SQuAD annotations are character based. We first project them to
-  # whitespace-tokenized words. But then after WordPiece tokenization, we can
-  # often find a "better match". For example:
-  #
-  # Question: What year was John Smith born?
-  # Context: The leader was John Smith (1895-1943).
-  # Answer: 1895
-  #
-  # The original whitespace-tokenized answer will be "(1895-1943).". However
-  # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
-  # the exact answer, 1895.
-  #
-  # However, this is not always possible. Consider the following:
-  #
-  # Question: What country is the top exporter of electronics?
-  # Context: The Japanese electronics industry is the largest in the world.
-  # Answer: Japan
-  #
-  # In this case, the annotator chose "Japan" as a character sub-span of
-  # the word "Japanese". Since our WordPiece tokenizer does not split
-  # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
-  # in SQuAD, but does happen.
-  tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
-
-  for new_start in range(input_start, input_end + 1):
-    for new_end in range(input_end, new_start - 1, -1):
-      text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
-      if text_span == tok_answer_text:
-        return (new_start, new_end)
-
-  return (input_start, input_end)
-
-
-def _check_is_max_context(doc_spans, cur_span_index, position):
-  """Check if this is the 'max context' doc span for the token."""
-
-  # Because of the sliding window approach taken to scoring documents, a single
-  # token can appear in multiple documents. E.g.
-  # Doc: the man went to the store and bought a gallon of milk
-  # Span A: the man went to the
-  # Span B: to the store and bought
-  # Span C: and bought a gallon of
-  # ...
-  #
-  # Now the word 'bought' will have two scores from spans B and C. We only
-  # want to consider the score with "maximum context", which we define as
-  # the *minimum* of its left and right context (the *sum* of left and
-  # right context will always be the same, of course).
-  #
-  # In the example the maximum context for 'bought' would be span C since
-  # it has 1 left context and 3 right context, while span B has 4 left context
-  # and 0 right context.
-  best_score = None
-  best_span_index = None
-  for (span_index, doc_span) in enumerate(doc_spans):
-    end = doc_span.start + doc_span.length - 1
-    if position < doc_span.start:
-      continue
-    if position > end:
-      continue
-    num_left_context = position - doc_span.start
-    num_right_context = end - position
-    score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
-    if best_score is None or score > best_score:
-      best_score = score
-      best_span_index = span_index
-
-  return cur_span_index == best_span_index
-
-
-def write_predictions(all_examples,
-                      all_features,
-                      all_results,
-                      n_best_size,
-                      max_answer_length,
-                      do_lower_case,
-                      output_prediction_file,
-                      output_nbest_file,
-                      output_null_log_odds_file,
-                      version_2_with_negative=False,
-                      null_score_diff_threshold=0.0,
-                      verbose=False):
-  """Write final predictions to the json file and log-odds of null if needed."""
-  logging.info("Writing predictions to: %s", (output_prediction_file))
-  logging.info("Writing nbest to: %s", (output_nbest_file))
-
-  all_predictions, all_nbest_json, scores_diff_json = (
-      postprocess_output(
-          all_examples=all_examples,
-          all_features=all_features,
-          all_results=all_results,
-          n_best_size=n_best_size,
-          max_answer_length=max_answer_length,
-          do_lower_case=do_lower_case,
-          version_2_with_negative=version_2_with_negative,
-          null_score_diff_threshold=null_score_diff_threshold,
-          verbose=verbose))
-
-  write_to_json_files(all_predictions, output_prediction_file)
-  write_to_json_files(all_nbest_json, output_nbest_file)
-  if version_2_with_negative:
-    write_to_json_files(scores_diff_json, output_null_log_odds_file)
-
-
-def postprocess_output(all_examples,
-                       all_features,
-                       all_results,
-                       n_best_size,
-                       max_answer_length,
-                       do_lower_case,
-                       version_2_with_negative=False,
-                       null_score_diff_threshold=0.0,
-                       xlnet_format=False,
-                       verbose=False):
-  """Postprocess model output, to form prediction results."""
-
-  example_index_to_features = collections.defaultdict(list)
-  for feature in all_features:
-    example_index_to_features[feature.example_index].append(feature)
-  unique_id_to_result = {}
-  for result in all_results:
-    unique_id_to_result[result.unique_id] = result
-
-  _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-      "PrelimPrediction",
-      ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])
-
-  all_predictions = collections.OrderedDict()
-  all_nbest_json = collections.OrderedDict()
-  scores_diff_json = collections.OrderedDict()
-
-  for (example_index, example) in enumerate(all_examples):
-    features = example_index_to_features[example_index]
-
-    prelim_predictions = []
-    # keep track of the minimum score of null start+end of position 0
-    score_null = 1000000  # large and positive
-    min_null_feature_index = 0  # the paragraph slice with min null score
-    null_start_logit = 0  # the start logit at the slice with min null score
-    null_end_logit = 0  # the end logit at the slice with min null score
-    for (feature_index, feature) in enumerate(features):
-      if feature.unique_id not in unique_id_to_result:
-        logging.info("Skip eval example %s, not in pred.", feature.unique_id)
-        continue
-      result = unique_id_to_result[feature.unique_id]
-
-      # if we could have irrelevant answers, get the min score of irrelevant
-      if version_2_with_negative:
-        if xlnet_format:
-          feature_null_score = result.class_logits
-        else:
-          feature_null_score = result.start_logits[0] + result.end_logits[0]
-        if feature_null_score < score_null:
-          score_null = feature_null_score
-          min_null_feature_index = feature_index
-          null_start_logit = result.start_logits[0]
-          null_end_logit = result.end_logits[0]
-      for (start_index, start_logit,
-           end_index, end_logit) in _get_best_indexes_and_logits(
-               result=result,
-               n_best_size=n_best_size,
-               xlnet_format=xlnet_format):
-        # We could hypothetically create invalid predictions, e.g., predict
-        # that the start of the span is in the question. We throw out all
-        # invalid predictions.
-        if start_index >= len(feature.tokens):
-          continue
-        if end_index >= len(feature.tokens):
-          continue
-        if start_index not in feature.token_to_orig_map:
-          continue
-        if end_index not in feature.token_to_orig_map:
-          continue
-        if not feature.token_is_max_context.get(start_index, False):
-          continue
-        if end_index < start_index:
-          continue
-        length = end_index - start_index + 1
-        if length > max_answer_length:
-          continue
-        prelim_predictions.append(
-            _PrelimPrediction(
-                feature_index=feature_index,
-                start_index=start_index,
-                end_index=end_index,
-                start_logit=start_logit,
-                end_logit=end_logit))
-
-    if version_2_with_negative and not xlnet_format:
-      prelim_predictions.append(
-          _PrelimPrediction(
-              feature_index=min_null_feature_index,
-              start_index=0,
-              end_index=0,
-              start_logit=null_start_logit,
-              end_logit=null_end_logit))
-    prelim_predictions = sorted(
-        prelim_predictions,
-        key=lambda x: (x.start_logit + x.end_logit),
-        reverse=True)
-
-    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "NbestPrediction", ["text", "start_logit", "end_logit"])
-
-    seen_predictions = {}
-    nbest = []
-    for pred in prelim_predictions:
-      if len(nbest) >= n_best_size:
-        break
-      feature = features[pred.feature_index]
-      if pred.start_index > 0 or xlnet_format:  # this is a non-null prediction
-        tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
-        orig_doc_start = feature.token_to_orig_map[pred.start_index]
-        orig_doc_end = feature.token_to_orig_map[pred.end_index]
-        orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
-        tok_text = " ".join(tok_tokens)
-
-        # De-tokenize WordPieces that have been split off.
-        tok_text = tok_text.replace(" ##", "")
-        tok_text = tok_text.replace("##", "")
-
-        # Clean whitespace
-        tok_text = tok_text.strip()
-        tok_text = " ".join(tok_text.split())
-        orig_text = " ".join(orig_tokens)
-
-        final_text = get_final_text(
-            tok_text, orig_text, do_lower_case, verbose=verbose)
-        if final_text in seen_predictions:
-          continue
-
-        seen_predictions[final_text] = True
-      else:
-        final_text = ""
-        seen_predictions[final_text] = True
-
-      nbest.append(
-          _NbestPrediction(
-              text=final_text,
-              start_logit=pred.start_logit,
-              end_logit=pred.end_logit))
-
-    # if we didn't include the empty option in the n-best, include it
-    if version_2_with_negative and not xlnet_format:
-      if "" not in seen_predictions:
-        nbest.append(
-            _NbestPrediction(
-                text="", start_logit=null_start_logit,
-                end_logit=null_end_logit))
-    # In very rare edge cases we could have no valid predictions. So we
-    # just create a nonce prediction in this case to avoid failure.
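-    # (This can occur when, e.g., every candidate span was filtered out
-    # above for exceeding max_answer_length or falling outside
-    # token_to_orig_map.)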
-    if not nbest:
-      nbest.append(
-          _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
-
-    assert len(nbest) >= 1
-
-    total_scores = []
-    best_non_null_entry = None
-    for entry in nbest:
-      total_scores.append(entry.start_logit + entry.end_logit)
-      if not best_non_null_entry:
-        if entry.text:
-          best_non_null_entry = entry
-
-    probs = _compute_softmax(total_scores)
-
-    nbest_json = []
-    for (i, entry) in enumerate(nbest):
-      output = collections.OrderedDict()
-      output["text"] = entry.text
-      output["probability"] = probs[i]
-      output["start_logit"] = entry.start_logit
-      output["end_logit"] = entry.end_logit
-      nbest_json.append(output)
-
-    assert len(nbest_json) >= 1
-
-    if not version_2_with_negative:
-      all_predictions[example.qas_id] = nbest_json[0]["text"]
-    else:
-      # pytype: disable=attribute-error
-      # predict "" iff the null score - the score of best non-null > threshold
-      if best_non_null_entry is not None:
-        if xlnet_format:
-          score_diff = score_null
-          scores_diff_json[example.qas_id] = score_diff
-          all_predictions[example.qas_id] = best_non_null_entry.text
-        else:
-          score_diff = score_null - best_non_null_entry.start_logit - (
-              best_non_null_entry.end_logit)
-          scores_diff_json[example.qas_id] = score_diff
-          if score_diff > null_score_diff_threshold:
-            all_predictions[example.qas_id] = ""
-          else:
-            all_predictions[example.qas_id] = best_non_null_entry.text
-      else:
-        logging.warning("best_non_null_entry is None")
-        scores_diff_json[example.qas_id] = score_null
-        all_predictions[example.qas_id] = ""
-      # pytype: enable=attribute-error
-
-    all_nbest_json[example.qas_id] = nbest_json
-
-  return all_predictions, all_nbest_json, scores_diff_json
-
-
-def write_to_json_files(json_records, json_file):
-  with tf.io.gfile.GFile(json_file, "w") as writer:
-    writer.write(json.dumps(json_records, indent=4) + "\n")
-
-
-def get_final_text(pred_text, orig_text, do_lower_case, verbose=False):
-  """Project the tokenized prediction back to the original text."""
-
-  # When we created the data, we kept track of the alignment between original
-  # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
-  # now `orig_text` contains the span of our original text corresponding to the
-  # span that we predicted.
-  #
-  # However, `orig_text` may contain extra characters that we don't want in
-  # our prediction.
-  #
-  # For example, let's say:
-  #   pred_text = steve smith
-  #   orig_text = Steve Smith's
-  #
-  # We don't want to return `orig_text` because it contains the extra "'s".
-  #
-  # We don't want to return `pred_text` because it's already been normalized
-  # (the SQuAD eval script also does punctuation stripping/lower casing but
-  # our tokenizer does additional normalization like stripping accent
-  # characters).
-  #
-  # What we really want to return is "Steve Smith".
-  #
-  # Therefore, we have to apply a semi-complicated alignment heuristic between
-  # `pred_text` and `orig_text` to get a character-to-character alignment. This
-  # can fail in certain cases in which case we just return `orig_text`.
-
-  def _strip_spaces(text):
-    ns_chars = []
-    ns_to_s_map = collections.OrderedDict()
-    for (i, c) in enumerate(text):
-      if c == " ":
-        continue
-      ns_to_s_map[len(ns_chars)] = i
-      ns_chars.append(c)
-    ns_text = "".join(ns_chars)
-    return (ns_text, ns_to_s_map)
-
-  # We first tokenize `orig_text`, strip whitespace from the result
-  # and `pred_text`, and check if they are the same length. If they are
-  # NOT the same length, the heuristic has failed.
If they are the same - # length, we assume the characters are one-to-one aligned. - tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case) - - tok_text = " ".join(tokenizer.tokenize(orig_text)) - - start_position = tok_text.find(pred_text) - if start_position == -1: - if verbose: - logging.info("Unable to find text: '%s' in '%s'", pred_text, orig_text) - return orig_text - end_position = start_position + len(pred_text) - 1 - - (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) - (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) - - if len(orig_ns_text) != len(tok_ns_text): - if verbose: - logging.info("Length not equal after stripping spaces: '%s' vs '%s'", - orig_ns_text, tok_ns_text) - return orig_text - - # We then project the characters in `pred_text` back to `orig_text` using - # the character-to-character alignment. - tok_s_to_ns_map = {} - for (i, tok_index) in six.iteritems(tok_ns_to_s_map): - tok_s_to_ns_map[tok_index] = i - - orig_start_position = None - if start_position in tok_s_to_ns_map: - ns_start_position = tok_s_to_ns_map[start_position] - if ns_start_position in orig_ns_to_s_map: - orig_start_position = orig_ns_to_s_map[ns_start_position] - - if orig_start_position is None: - if verbose: - logging.info("Couldn't map start position") - return orig_text - - orig_end_position = None - if end_position in tok_s_to_ns_map: - ns_end_position = tok_s_to_ns_map[end_position] - if ns_end_position in orig_ns_to_s_map: - orig_end_position = orig_ns_to_s_map[ns_end_position] - - if orig_end_position is None: - if verbose: - logging.info("Couldn't map end position") - return orig_text - - output_text = orig_text[orig_start_position:(orig_end_position + 1)] - return output_text - - -def _get_best_indexes_and_logits(result, - n_best_size, - xlnet_format=False): - """Generates the n-best indexes and logits from a list.""" - if xlnet_format: - for i in range(n_best_size): - for j in range(n_best_size): - j_index = i * n_best_size + j - yield (result.start_indexes[i], result.start_logits[i], - result.end_indexes[j_index], result.end_logits[j_index]) - else: - start_index_and_score = sorted(enumerate(result.start_logits), - key=lambda x: x[1], reverse=True) - end_index_and_score = sorted(enumerate(result.end_logits), - key=lambda x: x[1], reverse=True) - for i in range(len(start_index_and_score)): - if i >= n_best_size: - break - for j in range(len(end_index_and_score)): - if j >= n_best_size: - break - yield (start_index_and_score[i][0], start_index_and_score[i][1], - end_index_and_score[j][0], end_index_and_score[j][1]) - - -def _compute_softmax(scores): - """Compute softmax probability over raw logits.""" - if not scores: - return [] - - max_score = None - for score in scores: - if max_score is None or score > max_score: - max_score = score - - exp_scores = [] - total_sum = 0.0 - for score in scores: - x = math.exp(score - max_score) - exp_scores.append(x) - total_sum += x - - probs = [] - for score in exp_scores: - probs.append(score / total_sum) - return probs - - -def generate_tf_record_from_json_file(input_file_path, - vocab_file_path, - output_path, - translated_input_folder=None, - max_seq_length=384, - do_lower_case=True, - max_query_length=64, - doc_stride=128, - version_2_with_negative=False, - xlnet_format=False): - """Generates and saves training data into a tf record file.""" - train_examples = read_squad_examples( - input_file=input_file_path, - is_training=True, - version_2_with_negative=version_2_with_negative, - 
translated_input_folder=translated_input_folder) - tokenizer = tokenization.FullTokenizer( - vocab_file=vocab_file_path, do_lower_case=do_lower_case) - train_writer = FeatureWriter(filename=output_path, is_training=True) - number_of_examples = convert_examples_to_features( - examples=train_examples, - tokenizer=tokenizer, - max_seq_length=max_seq_length, - doc_stride=doc_stride, - max_query_length=max_query_length, - is_training=True, - output_fn=train_writer.process_feature, - xlnet_format=xlnet_format) - train_writer.close() - - meta_data = { - "task_type": "bert_squad", - "train_data_size": number_of_examples, - "max_seq_length": max_seq_length, - "max_query_length": max_query_length, - "doc_stride": doc_stride, - "version_2_with_negative": version_2_with_negative, - } - - return meta_data diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/squad_lib_sp.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/squad_lib_sp.py deleted file mode 100644 index b999bbdfa3a5bd811fe7d8ebbd73d8153805f4c5..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/squad_lib_sp.py +++ /dev/null @@ -1,992 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Run ALBERT on SQuAD 1.1 and SQuAD 2.0 using sentence piece tokenization. - -The file is forked from: - -https://github.com/google-research/ALBERT/blob/master/run_squad_sp.py -""" -import collections -import copy -import json -import math -import os - -from absl import logging -import numpy as np -import tensorflow as tf - -from official.nlp.bert import tokenization - - -class SquadExample(object): - """A single training/test example for simple sequence classification. - - For examples without an answer, the start and end position are -1. 
- """ - - def __init__(self, - qas_id, - question_text, - paragraph_text, - orig_answer_text=None, - start_position=None, - end_position=None, - is_impossible=False): - self.qas_id = qas_id - self.question_text = question_text - self.paragraph_text = paragraph_text - self.orig_answer_text = orig_answer_text - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - def __str__(self): - return self.__repr__() - - def __repr__(self): - s = "" - s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) - s += ", question_text: %s" % ( - tokenization.printable_text(self.question_text)) - s += ", paragraph_text: [%s]" % (" ".join(self.paragraph_text)) - if self.start_position: - s += ", start_position: %d" % (self.start_position) - if self.start_position: - s += ", end_position: %d" % (self.end_position) - if self.start_position: - s += ", is_impossible: %r" % (self.is_impossible) - return s - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, - unique_id, - example_index, - doc_span_index, - tok_start_to_orig_index, - tok_end_to_orig_index, - token_is_max_context, - tokens, - input_ids, - input_mask, - segment_ids, - paragraph_len, - class_index=None, - paragraph_mask=None, - start_position=None, - end_position=None, - is_impossible=None): - self.unique_id = unique_id - self.example_index = example_index - self.doc_span_index = doc_span_index - self.tok_start_to_orig_index = tok_start_to_orig_index - self.tok_end_to_orig_index = tok_end_to_orig_index - self.token_is_max_context = token_is_max_context - self.tokens = tokens - self.input_ids = input_ids - self.input_mask = input_mask - self.paragraph_mask = paragraph_mask - self.segment_ids = segment_ids - self.paragraph_len = paragraph_len - self.class_index = class_index - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - -def read_squad_examples(input_file, - is_training, - version_2_with_negative, - translated_input_folder=None): - """Read a SQuAD json file into a list of SquadExample.""" - del version_2_with_negative - with tf.io.gfile.GFile(input_file, "r") as reader: - input_data = json.load(reader)["data"] - - if translated_input_folder is not None: - translated_files = tf.io.gfile.glob( - os.path.join(translated_input_folder, "*.json")) - for file in translated_files: - with tf.io.gfile.GFile(file, "r") as reader: - input_data.extend(json.load(reader)["data"]) - - examples = [] - for entry in input_data: - for paragraph in entry["paragraphs"]: - paragraph_text = paragraph["context"] - - for qa in paragraph["qas"]: - qas_id = qa["id"] - question_text = qa["question"] - start_position = None - orig_answer_text = None - is_impossible = False - - if is_training: - is_impossible = qa.get("is_impossible", False) - if (len(qa["answers"]) != 1) and (not is_impossible): - raise ValueError( - "For training, each question should have exactly 1 answer.") - if not is_impossible: - answer = qa["answers"][0] - orig_answer_text = answer["text"] - start_position = answer["answer_start"] - else: - start_position = -1 - orig_answer_text = "" - - example = SquadExample( - qas_id=qas_id, - question_text=question_text, - paragraph_text=paragraph_text, - orig_answer_text=orig_answer_text, - start_position=start_position, - is_impossible=is_impossible) - examples.append(example) - - return examples - - -def _convert_index(index, pos, m=None, is_start=True): - """Converts index.""" - if 
index[pos] is not None: - return index[pos] - n = len(index) - rear = pos - while rear < n - 1 and index[rear] is None: - rear += 1 - front = pos - while front > 0 and index[front] is None: - front -= 1 - assert index[front] is not None or index[rear] is not None - if index[front] is None: - if index[rear] >= 1: - if is_start: - return 0 - else: - return index[rear] - 1 - return index[rear] - if index[rear] is None: - if m is not None and index[front] < m - 1: - if is_start: - return index[front] + 1 - else: - return m - 1 - return index[front] - if is_start: - if index[rear] > index[front] + 1: - return index[front] + 1 - else: - return index[rear] - else: - if index[rear] > index[front] + 1: - return index[rear] - 1 - else: - return index[front] - - -def convert_examples_to_features(examples, - tokenizer, - max_seq_length, - doc_stride, - max_query_length, - is_training, - output_fn, - do_lower_case, - xlnet_format=False, - batch_size=None): - """Loads a data file into a list of `InputBatch`s.""" - cnt_pos, cnt_neg = 0, 0 - base_id = 1000000000 - unique_id = base_id - max_n, max_m = 1024, 1024 - f = np.zeros((max_n, max_m), dtype=np.float32) - - for (example_index, example) in enumerate(examples): - - if example_index % 100 == 0: - logging.info("Converting %d/%d pos %d neg %d", example_index, - len(examples), cnt_pos, cnt_neg) - - query_tokens = tokenization.encode_ids( - tokenizer.sp_model, - tokenization.preprocess_text( - example.question_text, lower=do_lower_case)) - - if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] - - paragraph_text = example.paragraph_text - para_tokens = tokenization.encode_pieces( - tokenizer.sp_model, - tokenization.preprocess_text( - example.paragraph_text, lower=do_lower_case)) - - chartok_to_tok_index = [] - tok_start_to_chartok_index = [] - tok_end_to_chartok_index = [] - char_cnt = 0 - for i, token in enumerate(para_tokens): - new_token = token.replace(tokenization.SPIECE_UNDERLINE, " ") - chartok_to_tok_index.extend([i] * len(new_token)) - tok_start_to_chartok_index.append(char_cnt) - char_cnt += len(new_token) - tok_end_to_chartok_index.append(char_cnt - 1) - - tok_cat_text = "".join(para_tokens).replace(tokenization.SPIECE_UNDERLINE, - " ") - n, m = len(paragraph_text), len(tok_cat_text) - - if n > max_n or m > max_m: - max_n = max(n, max_n) - max_m = max(m, max_m) - f = np.zeros((max_n, max_m), dtype=np.float32) - - g = {} - - # pylint: disable=cell-var-from-loop - def _lcs_match(max_dist, n=n, m=m): - """Longest-common-substring algorithm.""" - f.fill(0) - g.clear() - - ### longest common sub sequence - # f[i, j] = max(f[i - 1, j], f[i, j - 1], f[i - 1, j - 1] + match(i, j)) - for i in range(n): - - # unlike standard LCS, this is specifically optimized for the setting - # because the mismatch between sentence pieces and original text will - # be small - for j in range(i - max_dist, i + max_dist): - if j >= m or j < 0: - continue - - if i > 0: - g[(i, j)] = 0 - f[i, j] = f[i - 1, j] - - if j > 0 and f[i, j - 1] > f[i, j]: - g[(i, j)] = 1 - f[i, j] = f[i, j - 1] - - f_prev = f[i - 1, j - 1] if i > 0 and j > 0 else 0 - if (tokenization.preprocess_text( - paragraph_text[i], lower=do_lower_case, - remove_space=False) == tok_cat_text[j] and f_prev + 1 > f[i, j]): - g[(i, j)] = 2 - f[i, j] = f_prev + 1 - - # pylint: enable=cell-var-from-loop - - max_dist = abs(n - m) + 5 - for _ in range(2): - _lcs_match(max_dist) - if f[n - 1, m - 1] > 0.8 * n: - break - max_dist *= 2 - - orig_to_chartok_index = [None] * n - 
chartok_to_orig_index = [None] * m
-    i, j = n - 1, m - 1
-    while i >= 0 and j >= 0:
-      if (i, j) not in g:
-        break
-      if g[(i, j)] == 2:
-        orig_to_chartok_index[i] = j
-        chartok_to_orig_index[j] = i
-        i, j = i - 1, j - 1
-      elif g[(i, j)] == 1:
-        j = j - 1
-      else:
-        i = i - 1
-
-    if (all(v is None for v in orig_to_chartok_index) or
-        f[n - 1, m - 1] < 0.8 * n):
-      logging.info("MISMATCH DETECTED!")
-      continue
-
-    tok_start_to_orig_index = []
-    tok_end_to_orig_index = []
-    for i in range(len(para_tokens)):
-      start_chartok_pos = tok_start_to_chartok_index[i]
-      end_chartok_pos = tok_end_to_chartok_index[i]
-      start_orig_pos = _convert_index(
-          chartok_to_orig_index, start_chartok_pos, n, is_start=True)
-      end_orig_pos = _convert_index(
-          chartok_to_orig_index, end_chartok_pos, n, is_start=False)
-
-      tok_start_to_orig_index.append(start_orig_pos)
-      tok_end_to_orig_index.append(end_orig_pos)
-
-    if not is_training:
-      tok_start_position = tok_end_position = None
-
-    if is_training and example.is_impossible:
-      tok_start_position = 0
-      tok_end_position = 0
-
-    if is_training and not example.is_impossible:
-      start_position = example.start_position
-      end_position = start_position + len(example.orig_answer_text) - 1
-
-      start_chartok_pos = _convert_index(
-          orig_to_chartok_index, start_position, is_start=True)
-      tok_start_position = chartok_to_tok_index[start_chartok_pos]
-
-      end_chartok_pos = _convert_index(
-          orig_to_chartok_index, end_position, is_start=False)
-      tok_end_position = chartok_to_tok_index[end_chartok_pos]
-      assert tok_start_position <= tok_end_position
-
-    def _piece_to_id(x):
-      return tokenizer.sp_model.PieceToId(x)
-
-    all_doc_tokens = list(map(_piece_to_id, para_tokens))
-
-    # The -3 accounts for [CLS], [SEP] and [SEP]
-    max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
-
-    # We can have documents that are longer than the maximum sequence length.
-    # To deal with this we do a sliding window approach, where we take chunks
-    # of up to our max length with a stride of `doc_stride`.
-    _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
-        "DocSpan", ["start", "length"])
-    doc_spans = []
-    start_offset = 0
-
-    while start_offset < len(all_doc_tokens):
-      length = len(all_doc_tokens) - start_offset
-      if length > max_tokens_for_doc:
-        length = max_tokens_for_doc
-      doc_spans.append(_DocSpan(start=start_offset, length=length))
-      if start_offset + length == len(all_doc_tokens):
-        break
-      start_offset += min(length, doc_stride)
-
-    for (doc_span_index, doc_span) in enumerate(doc_spans):
-      tokens = []
-      token_is_max_context = {}
-      segment_ids = []
-
-      # Paragraph mask used in XLNet.
-      # 1 represents paragraph and class tokens.
-      # 0 represents query and other special tokens.
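-      # As an illustration (hypothetical pieces), the XLNet-style layout
-      # assembled below puts the paragraph first and [CLS] last:
-      #   tokens:         ▁john ▁smith [SEP] ▁what ▁year [SEP] [CLS]
-      #   segment_ids:      0     0     0     1     1     1     2
-      #   paragraph_mask:   1     1     0     0     0     0     1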
-      paragraph_mask = []
-
-      cur_tok_start_to_orig_index = []
-      cur_tok_end_to_orig_index = []
-
-      # pylint: disable=cell-var-from-loop
-      def process_query(seg_q):
-        for token in query_tokens:
-          tokens.append(token)
-          segment_ids.append(seg_q)
-          paragraph_mask.append(0)
-        tokens.append(tokenizer.sp_model.PieceToId("[SEP]"))
-        segment_ids.append(seg_q)
-        paragraph_mask.append(0)
-
-      def process_paragraph(seg_p):
-        for i in range(doc_span.length):
-          split_token_index = doc_span.start + i
-
-          cur_tok_start_to_orig_index.append(
-              tok_start_to_orig_index[split_token_index])
-          cur_tok_end_to_orig_index.append(
-              tok_end_to_orig_index[split_token_index])
-
-          is_max_context = _check_is_max_context(doc_spans, doc_span_index,
-                                                 split_token_index)
-          token_is_max_context[len(tokens)] = is_max_context
-          tokens.append(all_doc_tokens[split_token_index])
-          segment_ids.append(seg_p)
-          paragraph_mask.append(1)
-        tokens.append(tokenizer.sp_model.PieceToId("[SEP]"))
-        segment_ids.append(seg_p)
-        paragraph_mask.append(0)
-        return len(tokens)
-
-      def process_class(seg_class):
-        class_index = len(segment_ids)
-        tokens.append(tokenizer.sp_model.PieceToId("[CLS]"))
-        segment_ids.append(seg_class)
-        paragraph_mask.append(1)
-        return class_index
-
-      if xlnet_format:
-        seg_p, seg_q, seg_class, seg_pad = 0, 1, 2, 3
-        paragraph_len = process_paragraph(seg_p)
-        process_query(seg_q)
-        class_index = process_class(seg_class)
-      else:
-        seg_p, seg_q, seg_class, seg_pad = 1, 0, 0, 0
-        class_index = process_class(seg_class)
-        process_query(seg_q)
-        paragraph_len = process_paragraph(seg_p)
-
-      input_ids = tokens
-
-      # The mask has 1 for real tokens and 0 for padding tokens. Only real
-      # tokens are attended to.
-      input_mask = [1] * len(input_ids)
-
-      # Zero-pad up to the sequence length.
-      while len(input_ids) < max_seq_length:
-        input_ids.append(0)
-        input_mask.append(0)
-        segment_ids.append(seg_pad)
-        paragraph_mask.append(0)
-
-      assert len(input_ids) == max_seq_length
-      assert len(input_mask) == max_seq_length
-      assert len(segment_ids) == max_seq_length
-      assert len(paragraph_mask) == max_seq_length
-
-      span_is_impossible = example.is_impossible
-      start_position = None
-      end_position = None
-      if is_training and not span_is_impossible:
-        # For training, if our document chunk does not contain an annotation,
-        # there is nothing to predict in it; the span is marked impossible
-        # below and the answer is pointed at the class token.
-        doc_start = doc_span.start
-        doc_end = doc_span.start + doc_span.length - 1
-        out_of_span = False
-        if not (tok_start_position >= doc_start and
-                tok_end_position <= doc_end):
-          out_of_span = True
-        if out_of_span:
-          start_position = 0
-          end_position = 0
-          span_is_impossible = True
-        else:
-          doc_offset = 0 if xlnet_format else len(query_tokens) + 2
-          start_position = tok_start_position - doc_start + doc_offset
-          end_position = tok_end_position - doc_start + doc_offset
-
-      if is_training and span_is_impossible:
-        start_position = class_index
-        end_position = class_index
-
-      if example_index < 20:
-        logging.info("*** Example ***")
-        logging.info("unique_id: %s", (unique_id))
-        logging.info("example_index: %s", (example_index))
-        logging.info("doc_span_index: %s", (doc_span_index))
-        logging.info("tok_start_to_orig_index: %s",
-                     " ".join([str(x) for x in cur_tok_start_to_orig_index]))
-        logging.info("tok_end_to_orig_index: %s",
-                     " ".join([str(x) for x in cur_tok_end_to_orig_index]))
-        logging.info(
-            "token_is_max_context: %s", " ".join(
-                ["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()]))
-        logging.info(
-            "input_pieces: %s",
-            " ".join([tokenizer.sp_model.IdToPiece(x) for x in tokens]))
-        logging.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
-        logging.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
-        logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
-        logging.info("paragraph_mask: %s", " ".join(
-            [str(x) for x in paragraph_mask]))
-        logging.info("class_index: %d", class_index)
-
-        if is_training and span_is_impossible:
-          logging.info("impossible example span")
-
-        if is_training and not span_is_impossible:
-          pieces = [
-              tokenizer.sp_model.IdToPiece(token)
-              for token in tokens[start_position:(end_position + 1)]
-          ]
-          answer_text = tokenizer.sp_model.DecodePieces(pieces)
-          logging.info("start_position: %d", (start_position))
-          logging.info("end_position: %d", (end_position))
-          logging.info("answer: %s", (tokenization.printable_text(answer_text)))
-
-      # With multiprocessing, the example_index is actually the index within
-      # the current process, so we use example_index=None to prevent it from
-      # being reused downstream.
-      # The current code does not use example_index of training data.
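-      # E.g., at inference time example_index is preserved so that
-      # postprocess_output can regroup the doc spans of each example; training
-      # features are only consumed through the serialized tf.Examples.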
-      if is_training:
-        feat_example_index = None
-      else:
-        feat_example_index = example_index
-
-      feature = InputFeatures(
-          unique_id=unique_id,
-          example_index=feat_example_index,
-          doc_span_index=doc_span_index,
-          tok_start_to_orig_index=cur_tok_start_to_orig_index,
-          tok_end_to_orig_index=cur_tok_end_to_orig_index,
-          token_is_max_context=token_is_max_context,
-          tokens=[tokenizer.sp_model.IdToPiece(x) for x in tokens],
-          input_ids=input_ids,
-          input_mask=input_mask,
-          paragraph_mask=paragraph_mask,
-          segment_ids=segment_ids,
-          paragraph_len=paragraph_len,
-          class_index=class_index,
-          start_position=start_position,
-          end_position=end_position,
-          is_impossible=span_is_impossible)
-
-      # Run callback
-      if is_training:
-        output_fn(feature)
-      else:
-        output_fn(feature, is_padding=False)
-
-      unique_id += 1
-      if span_is_impossible:
-        cnt_neg += 1
-      else:
-        cnt_pos += 1
-
-  if not is_training and feature:
-    assert batch_size
-    num_padding = 0
-    num_examples = unique_id - base_id
-    if unique_id % batch_size != 0:
-      num_padding = batch_size - (num_examples % batch_size)
-    dummy_feature = copy.deepcopy(feature)
-    for _ in range(num_padding):
-      dummy_feature.unique_id = unique_id
-
-      # Run callback
-      output_fn(dummy_feature, is_padding=True)
-      unique_id += 1
-
-  logging.info("Total number of instances: %d = pos %d neg %d",
-               cnt_pos + cnt_neg, cnt_pos, cnt_neg)
-  return unique_id - base_id
-
-
-def _check_is_max_context(doc_spans, cur_span_index, position):
-  """Check if this is the 'max context' doc span for the token."""
-
-  # Because of the sliding window approach taken to scoring documents, a single
-  # token can appear in multiple documents. E.g.
-  # Doc: the man went to the store and bought a gallon of milk
-  # Span A: the man went to the
-  # Span B: to the store and bought
-  # Span C: and bought a gallon of
-  # ...
-  #
-  # Now the word 'bought' will have two scores from spans B and C. We only
-  # want to consider the score with "maximum context", which we define as
-  # the *minimum* of its left and right context (the *sum* of left and
-  # right context will always be the same, of course).
-  #
-  # In the example the maximum context for 'bought' would be span C since
-  # it has 1 left context and 3 right context, while span B has 4 left context
-  # and 0 right context.
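-  # Working through the example above (both spans have length 5):
-  #   span B: min(4 left, 0 right) + 0.01 * 5 = 0.05
-  #   span C: min(1 left, 3 right) + 0.01 * 5 = 1.05
-  # so span C is the max-context span for 'bought'.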
-  best_score = None
-  best_span_index = None
-  for (span_index, doc_span) in enumerate(doc_spans):
-    end = doc_span.start + doc_span.length - 1
-    if position < doc_span.start:
-      continue
-    if position > end:
-      continue
-    num_left_context = position - doc_span.start
-    num_right_context = end - position
-    score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
-    if best_score is None or score > best_score:
-      best_score = score
-      best_span_index = span_index
-
-  return cur_span_index == best_span_index
-
-
-def write_predictions(all_examples,
-                      all_features,
-                      all_results,
-                      n_best_size,
-                      max_answer_length,
-                      do_lower_case,
-                      output_prediction_file,
-                      output_nbest_file,
-                      output_null_log_odds_file,
-                      version_2_with_negative=False,
-                      null_score_diff_threshold=0.0,
-                      verbose=False):
-  """Write final predictions to the json file and log-odds of null if needed."""
-  logging.info("Writing predictions to: %s", (output_prediction_file))
-  logging.info("Writing nbest to: %s", (output_nbest_file))
-
-  all_predictions, all_nbest_json, scores_diff_json = (
-      postprocess_output(
-          all_examples=all_examples,
-          all_features=all_features,
-          all_results=all_results,
-          n_best_size=n_best_size,
-          max_answer_length=max_answer_length,
-          do_lower_case=do_lower_case,
-          version_2_with_negative=version_2_with_negative,
-          null_score_diff_threshold=null_score_diff_threshold,
-          verbose=verbose))
-
-  write_to_json_files(all_predictions, output_prediction_file)
-  write_to_json_files(all_nbest_json, output_nbest_file)
-  if version_2_with_negative:
-    write_to_json_files(scores_diff_json, output_null_log_odds_file)
-
-
-def postprocess_output(all_examples,
-                       all_features,
-                       all_results,
-                       n_best_size,
-                       max_answer_length,
-                       do_lower_case,
-                       version_2_with_negative=False,
-                       null_score_diff_threshold=0.0,
-                       xlnet_format=False,
-                       verbose=False):
-  """Postprocess model output, to form prediction results."""
-
-  del do_lower_case, verbose
-  example_index_to_features = collections.defaultdict(list)
-  for feature in all_features:
-    example_index_to_features[feature.example_index].append(feature)
-
-  unique_id_to_result = {}
-  for result in all_results:
-    unique_id_to_result[result.unique_id] = result
-
-  _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-      "PrelimPrediction",
-      ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])
-
-  all_predictions = collections.OrderedDict()
-  all_nbest_json = collections.OrderedDict()
-  scores_diff_json = collections.OrderedDict()
-
-  for (example_index, example) in enumerate(all_examples):
-    features = example_index_to_features[example_index]
-
-    prelim_predictions = []
-    # keep track of the minimum score of null start+end of position 0
-    score_null = 1000000  # large and positive
-    min_null_feature_index = 0  # the paragraph slice with min null score
-    null_start_logit = 0  # the start logit at the slice with min null score
-    null_end_logit = 0  # the end logit at the slice with min null score
-    for (feature_index, feature) in enumerate(features):
-      if feature.unique_id not in unique_id_to_result:
-        logging.info("Skip eval example %s, not in pred.", feature.unique_id)
-        continue
-      result = unique_id_to_result[feature.unique_id]
-
-      # if we could have irrelevant answers, get the min score of irrelevant
-      if version_2_with_negative:
-        if xlnet_format:
-          feature_null_score = result.class_logits
-        else:
-          feature_null_score = result.start_logits[0] + result.end_logits[0]
-        if feature_null_score < score_null:
-          score_null = feature_null_score
-          min_null_feature_index = feature_index
-          null_start_logit = result.start_logits[0]
-          null_end_logit = result.end_logits[0]
-
-      doc_offset = 0 if xlnet_format else feature.tokens.index("[SEP]") + 1
-
-      for (start_index, start_logit,
-           end_index, end_logit) in _get_best_indexes_and_logits(
-               result=result,
-               n_best_size=n_best_size,
-               xlnet_format=xlnet_format):
-        # We could hypothetically create invalid predictions, e.g., predict
-        # that the start of the span is in the question. We throw out all
-        # invalid predictions.
-        if start_index - doc_offset >= len(feature.tok_start_to_orig_index):
-          continue
-        if end_index - doc_offset >= len(feature.tok_end_to_orig_index):
-          continue
-        if not feature.token_is_max_context.get(start_index, False):
-          continue
-        if end_index < start_index:
-          continue
-        length = end_index - start_index + 1
-        if length > max_answer_length:
-          continue
-        prelim_predictions.append(
-            _PrelimPrediction(
-                feature_index=feature_index,
-                start_index=start_index - doc_offset,
-                end_index=end_index - doc_offset,
-                start_logit=start_logit,
-                end_logit=end_logit))
-
-    if version_2_with_negative and not xlnet_format:
-      prelim_predictions.append(
-          _PrelimPrediction(
-              feature_index=min_null_feature_index,
-              start_index=-1,
-              end_index=-1,
-              start_logit=null_start_logit,
-              end_logit=null_end_logit))
-    prelim_predictions = sorted(
-        prelim_predictions,
-        key=lambda x: (x.start_logit + x.end_logit),
-        reverse=True)
-
-    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-        "NbestPrediction", ["text", "start_logit", "end_logit"])
-
-    seen_predictions = {}
-    nbest = []
-    for pred in prelim_predictions:
-      if len(nbest) >= n_best_size:
-        break
-      feature = features[pred.feature_index]
-      if pred.start_index >= 0 or xlnet_format:  # this is a non-null prediction
-        tok_start_to_orig_index = feature.tok_start_to_orig_index
-        tok_end_to_orig_index = feature.tok_end_to_orig_index
-        start_orig_pos = tok_start_to_orig_index[pred.start_index]
-        end_orig_pos = tok_end_to_orig_index[pred.end_index]
-
-        paragraph_text = example.paragraph_text
-        final_text = paragraph_text[start_orig_pos:end_orig_pos + 1].strip()
-        if final_text in seen_predictions:
-          continue
-
-        seen_predictions[final_text] = True
-      else:
-        final_text = ""
-        seen_predictions[final_text] = True
-
-      nbest.append(
-          _NbestPrediction(
-              text=final_text,
-              start_logit=pred.start_logit,
-              end_logit=pred.end_logit))
-
-    # if we didn't include the empty option in the n-best, include it
-    if version_2_with_negative and not xlnet_format:
-      if "" not in seen_predictions:
-        nbest.append(
-            _NbestPrediction(
-                text="", start_logit=null_start_logit,
-                end_logit=null_end_logit))
-    # In very rare edge cases we could have no valid predictions. So we
-    # just create a nonce prediction in this case to avoid failure.
- if not nbest: - nbest.append( - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) - - assert len(nbest) >= 1 - - total_scores = [] - best_non_null_entry = None - for entry in nbest: - total_scores.append(entry.start_logit + entry.end_logit) - if not best_non_null_entry: - if entry.text: - best_non_null_entry = entry - - probs = _compute_softmax(total_scores) - - nbest_json = [] - for (i, entry) in enumerate(nbest): - output = collections.OrderedDict() - output["text"] = entry.text - output["probability"] = probs[i] - output["start_logit"] = entry.start_logit - output["end_logit"] = entry.end_logit - nbest_json.append(output) - - assert len(nbest_json) >= 1 - - if not version_2_with_negative: - all_predictions[example.qas_id] = nbest_json[0]["text"] - else: - assert best_non_null_entry is not None - if xlnet_format: - score_diff = score_null - scores_diff_json[example.qas_id] = score_diff - all_predictions[example.qas_id] = best_non_null_entry.text - else: - # predict "" iff the null score - the score of best non-null > threshold - score_diff = score_null - best_non_null_entry.start_logit - ( - best_non_null_entry.end_logit) - scores_diff_json[example.qas_id] = score_diff - if score_diff > null_score_diff_threshold: - all_predictions[example.qas_id] = "" - else: - all_predictions[example.qas_id] = best_non_null_entry.text - - all_nbest_json[example.qas_id] = nbest_json - - return all_predictions, all_nbest_json, scores_diff_json - - -def write_to_json_files(json_records, json_file): - with tf.io.gfile.GFile(json_file, "w") as writer: - writer.write(json.dumps(json_records, indent=4) + "\n") - - -def _get_best_indexes_and_logits(result, - n_best_size, - xlnet_format=False): - """Generates the n-best indexes and logits from a list.""" - if xlnet_format: - for i in range(n_best_size): - for j in range(n_best_size): - j_index = i * n_best_size + j - yield (result.start_indexes[i], result.start_logits[i], - result.end_indexes[j_index], result.end_logits[j_index]) - else: - start_index_and_score = sorted(enumerate(result.start_logits), - key=lambda x: x[1], reverse=True) - end_index_and_score = sorted(enumerate(result.end_logits), - key=lambda x: x[1], reverse=True) - for i in range(len(start_index_and_score)): - if i >= n_best_size: - break - for j in range(len(end_index_and_score)): - if j >= n_best_size: - break - yield (start_index_and_score[i][0], start_index_and_score[i][1], - end_index_and_score[j][0], end_index_and_score[j][1]) - - -def _compute_softmax(scores): - """Compute softmax probability over raw logits.""" - if not scores: - return [] - - max_score = None - for score in scores: - if max_score is None or score > max_score: - max_score = score - - exp_scores = [] - total_sum = 0.0 - for score in scores: - x = math.exp(score - max_score) - exp_scores.append(x) - total_sum += x - - probs = [] - for score in exp_scores: - probs.append(score / total_sum) - return probs - - -class FeatureWriter(object): - """Writes InputFeature to TF example file.""" - - def __init__(self, filename, is_training): - self.filename = filename - self.is_training = is_training - self.num_features = 0 - tf.io.gfile.makedirs(os.path.dirname(filename)) - self._writer = tf.io.TFRecordWriter(filename) - - def process_feature(self, feature): - """Write a InputFeature to the TFRecordWriter as a tf.train.Example.""" - self.num_features += 1 - - def create_int_feature(values): - feature = tf.train.Feature( - int64_list=tf.train.Int64List(value=list(values))) - return feature - - features = 
collections.OrderedDict() - features["unique_ids"] = create_int_feature([feature.unique_id]) - features["input_ids"] = create_int_feature(feature.input_ids) - features["input_mask"] = create_int_feature(feature.input_mask) - features["segment_ids"] = create_int_feature(feature.segment_ids) - if feature.paragraph_mask is not None: - features["paragraph_mask"] = create_int_feature(feature.paragraph_mask) - if feature.class_index is not None: - features["class_index"] = create_int_feature([feature.class_index]) - - if self.is_training: - features["start_positions"] = create_int_feature([feature.start_position]) - features["end_positions"] = create_int_feature([feature.end_position]) - impossible = 0 - if feature.is_impossible: - impossible = 1 - features["is_impossible"] = create_int_feature([impossible]) - - tf_example = tf.train.Example(features=tf.train.Features(feature=features)) - self._writer.write(tf_example.SerializeToString()) - - def close(self): - self._writer.close() - - -def generate_tf_record_from_json_file(input_file_path, - sp_model_file, - output_path, - translated_input_folder=None, - max_seq_length=384, - do_lower_case=True, - max_query_length=64, - doc_stride=128, - xlnet_format=False, - version_2_with_negative=False): - """Generates and saves training data into a tf record file.""" - train_examples = read_squad_examples( - input_file=input_file_path, - is_training=True, - version_2_with_negative=version_2_with_negative, - translated_input_folder=translated_input_folder) - tokenizer = tokenization.FullSentencePieceTokenizer( - sp_model_file=sp_model_file) - train_writer = FeatureWriter( - filename=output_path, is_training=True) - number_of_examples = convert_examples_to_features( - examples=train_examples, - tokenizer=tokenizer, - max_seq_length=max_seq_length, - doc_stride=doc_stride, - max_query_length=max_query_length, - is_training=True, - output_fn=train_writer.process_feature, - xlnet_format=xlnet_format, - do_lower_case=do_lower_case) - train_writer.close() - - meta_data = { - "task_type": "bert_squad", - "train_data_size": number_of_examples, - "max_seq_length": max_seq_length, - "max_query_length": max_query_length, - "doc_stride": doc_stride, - "version_2_with_negative": version_2_with_negative, - } - - return meta_data diff --git a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/tagging_data_lib.py b/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/tagging_data_lib.py deleted file mode 100644 index 9550eadf2ce75265cd6c9512533b57c8b6432f1f..0000000000000000000000000000000000000000 --- a/TensorFlow/built-in/nlp/bert-squad_ID1566_for_TensorFlow2.X/official/nlp/data/tagging_data_lib.py +++ /dev/null @@ -1,442 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# ============================================================================ -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -"""Library to process data for tagging task such as NER/POS.""" -import collections -import os - -from absl import logging -import tensorflow as tf - -from official.nlp.bert import tokenization -from official.nlp.data import classifier_data_lib - -# A negative label id for the padding label, which will not contribute -# to loss/metrics in training. -_PADDING_LABEL_ID = -1 - -# The special unknown token, used to substitute a word which has too many -# subwords after tokenization. -_UNK_TOKEN = "[UNK]" - - -class InputExample(object): - """A single training/test example for token classification.""" - - def __init__(self, - sentence_id, - sub_sentence_id=0, - words=None, - label_ids=None): - """Constructs an InputExample.""" - self.sentence_id = sentence_id - self.sub_sentence_id = sub_sentence_id - self.words = words if words else [] - self.label_ids = label_ids if label_ids else [] - - def add_word_and_label_id(self, word, label_id): - """Adds word and label_id pair in the example.""" - self.words.append(word) - self.label_ids.append(label_id) - - -def _read_one_file(file_name, label_list): - """Reads one file and returns a list of `InputExample` instances.""" - lines = tf.io.gfile.GFile(file_name, "r").readlines() - examples = [] - label_id_map = {label: i for i, label in enumerate(label_list)} - sentence_id = 0 - example = InputExample(sentence_id=0) - for line in lines: - line = line.strip("\n") - if line: - # The format is: \t