From 1f09a306275e8d71263e3fc0dcc933f9266ebb40 Mon Sep 17 00:00:00 2001 From: QiuYao Date: Tue, 27 Sep 2022 20:08:25 +0800 Subject: [PATCH 1/3] init add atc code for smith --- .../layers_orig.py | 570 ++++++++++++++++++ .../modeling_orig.py | 491 +++++++++++++++ .../preprocessing_smith_orig.py | 547 +++++++++++++++++ .../preprocessing_smith_test.py | 21 +- .../preprocessing_smith_test_orig.py | 108 ++++ 5 files changed, 1730 insertions(+), 7 deletions(-) create mode 100644 TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/layers_orig.py create mode 100644 TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling_orig.py create mode 100644 TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_orig.py create mode 100644 TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test_orig.py diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/layers_orig.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/layers_orig.py new file mode 100644 index 000000000..7f1254935 --- /dev/null +++ b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/layers_orig.py @@ -0,0 +1,570 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Model layers in dual encoder SMITH model.""" +from npu_bridge.npu_init import * +from six.moves import range +from npu_bridge.estimator.npu import npu_convert_dropout +import tensorflow.compat.v1 as tf + +from smith import constants +from smith.bert import modeling + + +def get_doc_rep_with_masked_sent(input_sent_reps_doc, + sent_mask_embedding, + input_mask_doc_level, + batch_size_static=32, + max_masked_sent_per_doc=2, + loop_sent_number_per_doc=32): + """Get the document representations with masked sentences. + + Args: + input_sent_reps_doc: float Tensor. The independent sentence embeddings + without masks for the sentences in the current document. The shape is + [batch, loop_sent_number_per_doc, hidden]. + sent_mask_embedding: float Tensor. The sentence embedding vector for the + masked position. The shape is [hidden]. + input_mask_doc_level: int Tensor. The input masks on the document level to + identify whether a location is a real sentence (mask = 1) or a padded + sentence (mask = 0). The shape is [batch, loop_sent_number_per_doc]. + batch_size_static: scalar. 
The static batch size depending on the training + or the evaluation mode. + max_masked_sent_per_doc: scalar. The maximum number of masked sentences + per document. + loop_sent_number_per_doc: scalar. The number of looped sentences per + document. + + Returns: + The document representations with masked sentences and the positions/ + weights for each masked sentences. This masked sentence weight is 1 for the + sampled real sentence position and 0 for the padded sentence position. + """ + # We at least mask two sentences to build a candidate sentence pool for + # negative sentence sampling. We generate the masked_sent_index and + # masked_sent_weight for each document. Note that we do not add any word + # or sentence level masks during prediction or inference stage. + max_masked_sent_per_doc = max(max_masked_sent_per_doc, 2) + input_sent_reps_doc_list = tf.unstack( + input_sent_reps_doc, num=batch_size_static) + real_sent_number_per_doc = tf.unstack( + tf.reduce_sum(input_mask_doc_level, 1), num=batch_size_static) + masked_sent_index_list = [] + masked_sent_weight_list = [] + + # For each example in the current batch, we randomly sample + # max_masked_sent_per_doc positions to mask the sentences. For each masked + # sentence position, the sentence in the current position is the positive + # example. The other co-masked sentences are the negative examples. + # The sampled sentence indexes will not be duplicated. + for batch_i in range(0, batch_size_static): + # Since everything in TPU must have a fixed shape, here the max sampled + # sentence index can be as large as loop_sent_number_per_doc. We will + # generate the corresponding sentence LM weights to reduce the impact + # on the final masked sentence LM loss following a similar way with the + # handling of masked word LM loss and masked word LM weights. + real_sent_number = real_sent_number_per_doc[batch_i] + sampled_sent_index = tf.slice( + tf.random_shuffle(tf.range(loop_sent_number_per_doc)), [0], + [max_masked_sent_per_doc]) + sampled_sent_index = tf.sort(sampled_sent_index) + masked_sent_index_list.append(sampled_sent_index) + # Generates the corresponding sampled_sent_weight + sample_sent_weight = tf.cast( + tf.less(sampled_sent_index, real_sent_number), tf.float32) + masked_sent_weight_list.append(sample_sent_weight) + + indices = tf.reshape(sampled_sent_index, [max_masked_sent_per_doc, -1]) + # Duplicates sent_mask_embedding for each masked position. + updates = tf.reshape( + tf.tile( + sent_mask_embedding, + [max_masked_sent_per_doc], + ), [max_masked_sent_per_doc, -1]) + input_sent_reps_doc_list[batch_i] = tf.tensor_scatter_update( + input_sent_reps_doc_list[batch_i], indices, updates) + # Here masked_sent_index_list is a list a tensors, where each tensor stores + # the masked sentence positions for each document in the current batch. The + # shape of masked_sent_index_list is [batch, max_masked_sent_per_doc]. + # Here masked_sent_weight_list is a list a tensors, where each tensor stores + # the masked sentence weights for each document in the current batch. The + # shape of masked_sent_weight_list is [batch, max_masked_sent_per_doc]. + return (tf.stack(input_sent_reps_doc_list), tf.stack(masked_sent_index_list), + tf.stack(masked_sent_weight_list)) + + +def get_masked_sent_lm_output(bert_config, + input_tensor, + cur_sent_reps_doc_unmask, + sent_masked_positions, + sent_masked_weights, + debugging=False): + """Get the sentence level masked LM loss. + + Args: + bert_config: BertConfig object. 
The configuration file for the document + level BERT model. + input_tensor: float Tensor. The contextualized representations of all + sentences learned by the document level BERT model. The shape is [batch, + loop_sent_number_per_doc, hidden]. This is the model prediction. + cur_sent_reps_doc_unmask: float Tensor. The unmasked sentence + representations of the current document. The shape is [batch, + loop_sent_number_per_doc, hidden]. This is the source of the ground + truth and negative examples in the masked sentence prediction. + sent_masked_positions: int Tensor. The masked sentence positions in the + current document. The shape is [batch, max_masked_sent_per_doc]. + sent_masked_weights: float Tensor. The masked sentence weights in the + current document. The shape is [batch, max_masked_sent_per_doc]. + debugging: bool. Whether it is in the debugging mode. + + Returns: + The masked sentence LM loss and the mask sentence LM loss per example. + + """ + # The current method for masked sentence prediction: we approach this problem + # as a multi-class classification problem similar to the masked word LM task. + # For each masked sentence position, the sentence in the current position is + # the positive example. The other co-masked sentences in the current document + # and in the other documents of the same batch are the negative examples. We + # compute the cross entropy loss over the sentence prediction task following + # the implementation of the masked word LM loss in the BERT model. + + input_tensor_shape = modeling.get_shape_list(input_tensor) + batch_size = input_tensor_shape[0] + masked_position_shape = modeling.get_shape_list(sent_masked_positions) + max_predictions_per_seq = masked_position_shape[1] + + # In the context of masked sentence prediction, the max_predictions_per_seq + # is the same with max_masked_sent_per_doc. + # Output Shape: [batch * max_predictions_per_seq, hidden]. + # Input_tensor is the model prediction for each position. + input_tensor = gather_indexes(input_tensor, sent_masked_positions) + # Independent_sent_embeddings is the ground truth input sentence embeddings + # for the document level BERT model. The output shape is [batch * + # max_predictions_per_seq, hidden]. + independent_sent_embeddings = gather_indexes(cur_sent_reps_doc_unmask, + sent_masked_positions) + + with tf.variable_scope("cls/sent_predictions", reuse=tf.AUTO_REUSE): + # We apply one more non-linear transformation before the output layer. + # This matrix is not used after pre-training. + with tf.variable_scope("transform"): + input_tensor = tf.layers.dense( + input_tensor, + units=bert_config.hidden_size, + activation=modeling.get_activation(bert_config.hidden_act), + kernel_initializer=modeling.create_initializer( + bert_config.initializer_range)) + # Output Shape: [batch * max_predictions_per_seq, hidden]. + input_tensor = modeling.layer_norm(input_tensor) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each predicted position. + output_bias = tf.get_variable( + "output_bias", + shape=[batch_size * max_predictions_per_seq], + initializer=tf.zeros_initializer()) + # Shape of input_tensor [batch * max_predictions_per_seq, hidden]. + # Shape of independent_sent_embeddings is [batch * max_predictions_per_seq, + # hidden]. + # Shape of logits: [batch * max_predictions_per_seq, + # batch * max_predictions_per_seq]. 
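+    # Illustrative sketch (added comment, not from the upstream SMITH code).
+    # The masked-sentence LM described above is an in-batch multi-class
+    # problem: with N = batch * max_predictions_per_seq predicted vectors P
+    # and the N unmasked "ground truth" sentence embeddings G, logits = P G^T
+    # is an [N, N] score matrix whose correct class for row i is column i, so
+    # the labels are simply range(N). A toy NumPy version (hypothetical sizes):
+    #
+    #   import numpy as np
+    #   N, H = 4, 8                      # batch * max_predictions_per_seq, hidden
+    #   P = np.random.randn(N, H)        # predictions at the masked positions
+    #   G = np.random.randn(N, H)        # unmasked sentence embeddings
+    #   logits = P @ G.T                 # [N, N]
+    #   z = logits - logits.max(-1, keepdims=True)
+    #   log_probs = z - np.log(np.exp(z).sum(-1, keepdims=True))
+    #   label_ids = np.arange(N)         # the diagonal is the positive class
+    #   per_example_loss = -log_probs[np.arange(N), label_ids]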
+ logits = tf.matmul( + input_tensor, independent_sent_embeddings, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + # Output Shape: [batch * max_predictions_per_seq, + # batch * max_predictions_per_seq]. + log_probs = tf.nn.log_softmax(logits, axis=-1) + + # Output Shape: [batch * max_predictions_per_seq]. + # Double checked the setting of label_ids here. The label_ids + # should be the label index in the "sentence vocabulary". Thus if batch=32, + # max_predictions_per_seq = 2, then label ids should be like + # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ..., 63]. For the ground truth one hot + # label matrix, only the values in the diagonal positions are 1. All the + # other positions should be 0. + label_ids = tf.range( + 0, batch_size * max_predictions_per_seq, dtype=tf.int32) + if debugging: + label_ids = tf.Print( + label_ids, [label_ids], + message="label_ids in get_masked_sent_lm_output", + summarize=30) + # Output Shape: [batch * max_predictions_per_seq]. + # The label_weights is the flatten vector based on sent_masked_weights, + # where the weight is 1.0 for sampled real sentences and 0.0 for sampled + # masked sentences. + label_weights = tf.reshape(sent_masked_weights, [-1]) + + # Output Shape: [batch * max_predictions_per_seq, + # batch * max_predictions_per_seq]. + one_hot_labels = tf.one_hot( + label_ids, depth=batch_size * max_predictions_per_seq, dtype=tf.float32) + + # Output Shape: [batch * max_predictions_per_seq]. + per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) + # Output Shape: [1]. + numerator = tf.reduce_sum(label_weights * per_example_loss) + # Output Shape: [1]. + denominator = tf.reduce_sum(label_weights) + 1e-5 + # Output Shape: [1]. + loss = numerator / denominator + # Shape of loss [1]. + # Shape of per_example_loss is [batch * max_predictions_per_seq]. + return (loss, per_example_loss, log_probs) + + +def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, + label_ids, label_weights): + """Get loss and log probs for the masked LM.""" + # Output Shape: [batch * max_predictions_per_seq, hidden]. + input_tensor = gather_indexes(input_tensor, positions) + + with tf.variable_scope("cls/word_predictions", reuse=tf.AUTO_REUSE): + # We apply one more non-linear transformation before the output layer. + # This matrix is not used after pre-training. + with tf.variable_scope("transform"): + input_tensor = tf.layers.dense( + input_tensor, + units=bert_config.hidden_size, + activation=modeling.get_activation(bert_config.hidden_act), + kernel_initializer=modeling.create_initializer( + bert_config.initializer_range)) + # Output Shape: [batch * max_predictions_per_seq, hidden]. + input_tensor = modeling.layer_norm(input_tensor) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + output_bias = tf.get_variable( + "output_bias", + shape=[bert_config.vocab_size], + initializer=tf.zeros_initializer()) + # Shape of input_tensor [batch * max_predictions_per_seq, embedding_size]. + # Shape of output_weights (embed table) is [vocab_size, embedding_size]. + # In the current Bert implementation: embedding_size = hidden. + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + # Output Shape: [batch * max_predictions_per_seq, vocab_size]. + log_probs = tf.nn.log_softmax(logits, axis=-1) + + # Output Shape: [batch * max_predictions_per_seq]. 
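+    # Illustrative sketch (added comment, not from the upstream SMITH code).
+    # The weighted masked-word loss below averages the per-prediction cross
+    # entropy with label_weights, so zero-weight (padded) predictions do not
+    # contribute. A toy NumPy version with hypothetical probabilities:
+    #
+    #   import numpy as np
+    #   log_probs = np.log([[0.7, 0.2, 0.1],     # 2 predictions, vocab = 3
+    #                       [0.1, 0.8, 0.1]])
+    #   label_ids = np.array([0, 1])
+    #   label_weights = np.array([1.0, 0.0])     # second prediction is padding
+    #   one_hot = np.eye(3)[label_ids]
+    #   per_example_loss = -(log_probs * one_hot).sum(-1)   # [-log 0.7, -log 0.8]
+    #   loss = (label_weights * per_example_loss).sum() / (label_weights.sum() + 1e-5)
+    #   # loss ~= -log(0.7); the padded prediction is excluded from the average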
+ label_ids = tf.reshape(label_ids, [-1]) + # Output Shape: [batch * max_predictions_per_seq]. + label_weights = tf.reshape(label_weights, [-1]) + + # Output Shape: [batch * max_predictions_per_seq, vocab_size]. + one_hot_labels = tf.one_hot( + label_ids, depth=bert_config.vocab_size, dtype=tf.float32) + + # The `positions` tensor might be zero-padded (if the sequence is too + # short to have the maximum number of predictions). The `label_weights` + # tensor has a value of 1.0 for every real prediction and 0.0 for the + # padding predictions. + # Output Shape: [batch * max_predictions_per_seq]. + per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) + # Output Shape: [1]. + numerator = tf.reduce_sum(label_weights * per_example_loss) + # Output Shape: [1]. + denominator = tf.reduce_sum(label_weights) + 1e-5 + # Output Shape: [1]. + loss = numerator / denominator + # Shape of loss [1]. + # Shape of per_example_loss is [batch * max_predictions_per_seq]. + return (loss, per_example_loss, log_probs) + + +def gather_indexes(sequence_tensor, positions): + """Gathers the vectors at the specific positions over a minibatch.""" + # Shape of positions: [batch, max_mask_per_seq]. + sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) + batch_size = sequence_shape[0] + seq_length = sequence_shape[1] + width = sequence_shape[2] + + # Shape of flat_offsets: [batch, 1]. + flat_offsets = tf.reshape( + tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) + flat_positions = tf.reshape(positions + flat_offsets, [-1]) + flat_sequence_tensor = tf.reshape(sequence_tensor, + [batch_size * seq_length, width]) + output_tensor = tf.gather(flat_sequence_tensor, flat_positions) + # The shape of output_tensor [batch * max_mask_per_seq, hidden]. + return output_tensor + + +def get_attention_weighted_sum(input_tensor, bert_config, is_training, + attention_size): + """Compute the attentive weighted sum of an input tensor. + + Args: + input_tensor: The input tensor for attentive representation. The shape of + input tensor is [batch, seq_length, hidden]. + bert_config: The model config file. + is_training: If true, it is in training mode. + attention_size: int. Dimension of contextual vector. + + Returns: + The attentive representation of the input tensor. The shape of the output + tensor is [batch, hidden]. + """ + with tf.variable_scope("combine_reps_attention", reuse=tf.AUTO_REUSE): + context_vector = tf.get_variable( + name="context_vector", + shape=[attention_size], + dtype=tf.float32) + # Output Shape: [batch, seq_length, attention_size]. + projection = tf.layers.dense( + input_tensor, + attention_size, + activation=tf.tanh, + kernel_initializer=modeling.create_initializer( + bert_config.initializer_range)) + # Output Shape: [batch, seq_length, 1]. + attention = tf.reduce_sum( + tf.multiply(projection, context_vector), axis=2, keep_dims=True) + # Output Shape: [batch, seq_length, 1]. + attention = tf.nn.softmax(attention, axis=1) + # Output Shape: [batch, hidden]. 
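+    # Illustrative sketch (added comment, not from the upstream SMITH code).
+    # The attention pooling above scores each position against a learned
+    # context vector and returns the softmax-weighted sum over the sequence
+    # axis. A toy NumPy version (hypothetical shapes, random parameters):
+    #
+    #   import numpy as np
+    #   batch, seq_len, hidden, att = 2, 3, 4, 5
+    #   x = np.random.randn(batch, seq_len, hidden)
+    #   W = np.random.randn(hidden, att)
+    #   context = np.random.randn(att)
+    #   proj = np.tanh(x @ W)                              # [batch, seq_len, att]
+    #   scores = (proj * context).sum(-1, keepdims=True)   # [batch, seq_len, 1]
+    #   weights = np.exp(scores) / np.exp(scores).sum(1, keepdims=True)
+    #   pooled = (x * weights).sum(1)                      # [batch, hidden]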
+ last_outputs = tf.reduce_sum(tf.multiply(input_tensor, attention), axis=1) + if is_training: + last_outputs = tf.layers.dropout( + last_outputs, bert_config.attention_probs_dropout_prob, training=True) + return last_outputs + + +def get_seq_rep_from_bert(bert_model): + """Get the sequence represenation given a BERT encoder.""" + siamese_input_tensor = bert_model.get_pooled_output() + hidden_size = siamese_input_tensor.shape[-1].value + siamese_input_tensor = tf.layers.dense( + siamese_input_tensor, units=hidden_size, activation=tf.nn.relu) + normalized_siamese_input_tensor = tf.nn.l2_normalize( + siamese_input_tensor, axis=1) + return normalized_siamese_input_tensor + + +def get_sent_reps_masks_normal_loop(sent_index, + input_sent_reps_doc, + input_mask_doc_level, + masked_lm_loss_doc, + masked_lm_example_loss_doc, + masked_lm_weights_doc, + dual_encoder_config, + is_training, + train_mode, + input_ids, + input_mask, + masked_lm_positions, + masked_lm_ids, + masked_lm_weights, + use_one_hot_embeddings, + debugging=False): + """Get the sentence encodings, mask ids and masked word LM loss. + + Args: + sent_index: The index of the current looped sentence. + input_sent_reps_doc: The representations of all sentences in the doc + learned by BERT. + input_mask_doc_level: The document level input masks, which indicates + whether a sentence is a real sentence or a padded sentence. + masked_lm_loss_doc: The sum of all the masked word LM loss. + masked_lm_example_loss_doc: The per example masked word LM loss. + masked_lm_weights_doc: the weights of the maksed LM words. If the position + is corresponding to a real masked word, it is 1.0; It is a padded mask, + the weight is 0. + dual_encoder_config: The config of the dual encoder. + is_training: Whether it is in the training mode. + train_mode: string. The train mode which can be finetune, joint_train, or + pretrain. + input_ids: The ids of the input tokens. + input_mask: The mask of the input tokens. + masked_lm_positions: The positions of the masked words in the language + model training. + masked_lm_ids: The ids of the masked words in LM model training. + masked_lm_weights: The weights of the masked words in LM model training. + use_one_hot_embeddings: Whether use one hot embedding. It should be true + for the runs on TPUs. + debugging: bool. Whether it is in the debugging mode. + + Returns: + A list of tensors on the learned sentence representations and the masked + word LM loss. + """ + # Collect token information for the current sentence. + bert_config = modeling.BertConfig.from_json_file( + dual_encoder_config.encoder_config.bert_config_file) + max_sent_length_by_word = dual_encoder_config.encoder_config.max_sent_length_by_word + sent_bert_trainable = dual_encoder_config.encoder_config.sent_bert_trainable + max_predictions_per_seq = dual_encoder_config.encoder_config.max_predictions_per_seq + sent_start = sent_index * max_sent_length_by_word + input_ids_cur_sent = tf.slice(input_ids, [0, sent_start], + [-1, max_sent_length_by_word]) + # Output shape: [batch, max_sent_length_by_word]. + input_mask_cur_sent = tf.slice(input_mask, [0, sent_start], + [-1, max_sent_length_by_word]) + # Output Shape: [batch]. + input_mask_cur_sent_max = tf.reduce_max(input_mask_cur_sent, 1) + # Output Shape: [loop_sent_number_per_doc, batch]. 
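+  # Illustrative sketch (added comment, not from the upstream SMITH code).
+  # Each sentence block is a fixed-width slice of the flattened document
+  # sequence, and the document-level mask for a block is the max over its word
+  # mask (1 if the block contains any real token). Toy NumPy version:
+  #
+  #   import numpy as np
+  #   max_sent_length_by_word = 4
+  #   input_ids = np.array([[101, 7, 8, 102,  101, 9, 102, 0,  0, 0, 0, 0]])
+  #   input_mask = (input_ids > 0).astype(np.int32)
+  #   for sent_index in range(3):
+  #     start = sent_index * max_sent_length_by_word
+  #     mask_cur = input_mask[:, start:start + max_sent_length_by_word]
+  #     is_real_sentence = mask_cur.max(axis=1)   # [1], [1], then [0] (padding)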
+ input_mask_doc_level.append(input_mask_cur_sent_max) + if debugging: + input_ids_cur_sent = tf.Print( + input_ids_cur_sent, [input_ids_cur_sent, input_mask_cur_sent], + message="input_ids_cur_sent in get_sent_reps_masks_lm_loss", + summarize=20) + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids_cur_sent, + input_mask=input_mask_cur_sent, + use_one_hot_embeddings=use_one_hot_embeddings, + sent_bert_trainable=sent_bert_trainable) + with tf.variable_scope("seq_rep_from_bert_sent_dense", reuse=tf.AUTO_REUSE): + normalized_siamese_input_tensor = get_seq_rep_from_bert(model) + input_sent_reps_doc.append(normalized_siamese_input_tensor) + + if (train_mode == constants.TRAIN_MODE_PRETRAIN or + train_mode == constants.TRAIN_MODE_JOINT_TRAIN): + # Collect masked token information for the current sentence. + sent_mask_lm_token_start = sent_index * max_predictions_per_seq + # Output shape: [batch, max_predictions_per_seq]. + masked_lm_positions_cur_sent = tf.slice(masked_lm_positions, + [0, sent_mask_lm_token_start], + [-1, max_predictions_per_seq]) + masked_lm_ids_cur_sent = tf.slice(masked_lm_ids, + [0, sent_mask_lm_token_start], + [-1, max_predictions_per_seq]) + masked_lm_weights_cur_sent = tf.slice(masked_lm_weights, + [0, sent_mask_lm_token_start], + [-1, max_predictions_per_seq]) + # Since in the processed data of smith model, the masked lm positions are + # global indices started from the 1st token of the whole sequence, we need + # to transform this global position to a local position for the current + # sentence. The position index is started from 0. + # Local_index = global_index mod max_sent_length_by_word. + masked_lm_positions_cur_sent = tf.mod(masked_lm_positions_cur_sent, + max_sent_length_by_word) + # Shape of masked_lm_loss_cur_sent [1]. + # Shape of masked_lm_example_loss_cur_sent is [batch, + # max_predictions_per_seq]. + (masked_lm_loss_cur_sent, masked_lm_example_loss_cur_sent, + _) = get_masked_lm_output(bert_config, model.get_sequence_output(), + model.get_embedding_table(), + masked_lm_positions_cur_sent, + masked_lm_ids_cur_sent, + masked_lm_weights_cur_sent) + # Output Shape: [1]. + masked_lm_loss_doc += masked_lm_loss_cur_sent + # Output Shape: [loop_sent_number_per_doc, batch * max_predictions_per_seq]. + masked_lm_example_loss_doc.append(masked_lm_example_loss_cur_sent) + # Output Shape: [loop_sent_number_per_doc, batch, max_predictions_per_seq]. + masked_lm_weights_doc.append(masked_lm_weights_cur_sent) + return (input_sent_reps_doc, input_mask_doc_level, masked_lm_loss_doc, + masked_lm_example_loss_doc, masked_lm_weights_doc) + + +def learn_sent_reps_normal_loop(dual_encoder_config, is_training, train_mode, + input_ids_1, input_mask_1, + masked_lm_positions_1, masked_lm_ids_1, + masked_lm_weights_1, input_ids_2, input_mask_2, + masked_lm_positions_2, masked_lm_ids_2, + masked_lm_weights_2, use_one_hot_embeddings): + """Learn the sentence representations with normal loop functions.""" + input_sent_reps_doc_1 = [] + # Generate document level input masks on each sentence based on the word + # level input mask information. + input_mask_doc_level_1 = [] + masked_lm_loss_doc_1 = 0.0 + masked_lm_example_loss_doc_1 = [] + masked_lm_weights_doc_1 = [] + + input_mask_doc_level_2 = [] + input_sent_reps_doc_2 = [] + masked_lm_loss_doc_2 = 0.0 + masked_lm_example_loss_doc_2 = [] + masked_lm_weights_doc_2 = [] + + # Learn the representation for each sentence in the document. 
+ # Setting smaller number of loop_sent_number_per_doc can save memory for the + # model training. + # Shape of masked_lm_loss_doc_1 [1]. + # Shape of masked_lm_example_loss_doc_1 is [max_doc_length_by_sentence, + # batch * max_predictions_per_seq]. + for sent_index in range( + 0, dual_encoder_config.encoder_config.loop_sent_number_per_doc): + (input_sent_reps_doc_1, input_mask_doc_level_1, masked_lm_loss_doc_1, + masked_lm_example_loss_doc_1, + masked_lm_weights_doc_1) = get_sent_reps_masks_normal_loop( + sent_index, input_sent_reps_doc_1, input_mask_doc_level_1, + masked_lm_loss_doc_1, masked_lm_example_loss_doc_1, + masked_lm_weights_doc_1, dual_encoder_config, is_training, train_mode, + input_ids_1, input_mask_1, masked_lm_positions_1, masked_lm_ids_1, + masked_lm_weights_1, use_one_hot_embeddings) + (input_sent_reps_doc_2, input_mask_doc_level_2, masked_lm_loss_doc_2, + masked_lm_example_loss_doc_2, + masked_lm_weights_doc_2) = get_sent_reps_masks_normal_loop( + sent_index, input_sent_reps_doc_2, input_mask_doc_level_2, + masked_lm_loss_doc_2, masked_lm_example_loss_doc_2, + masked_lm_weights_doc_2, dual_encoder_config, is_training, train_mode, + input_ids_2, input_mask_2, masked_lm_positions_2, masked_lm_ids_2, + masked_lm_weights_2, use_one_hot_embeddings) + + # Stack the sentence representations to learn the doc representations. + # Output Shape: [batch, loop_sent_number_per_doc, hidden]. + input_sent_reps_doc_1_unmask = tf.stack(input_sent_reps_doc_1, axis=1) + input_sent_reps_doc_2_unmask = tf.stack(input_sent_reps_doc_2, axis=1) + + # Output Shape: [batch, loop_sent_number_per_doc]. + input_mask_doc_level_1_tensor = tf.stack(input_mask_doc_level_1, axis=1) + input_mask_doc_level_2_tensor = tf.stack(input_mask_doc_level_2, axis=1) + + if (train_mode == constants.TRAIN_MODE_PRETRAIN or + train_mode == constants.TRAIN_MODE_JOINT_TRAIN): + # Output Shape: [batch * max_predictions_per_seq, + # loop_sent_number_per_doc]. + masked_lm_example_loss_doc_1 = tf.stack( + masked_lm_example_loss_doc_1, axis=1) + masked_lm_example_loss_doc_2 = tf.stack( + masked_lm_example_loss_doc_2, axis=1) + + # Output Shape: [batch, loop_sent_number_per_doc, max_predictions_per_seq]. + masked_lm_weights_doc_1 = tf.stack(masked_lm_weights_doc_1, axis=1) + masked_lm_weights_doc_2 = tf.stack(masked_lm_weights_doc_2, axis=1) + else: + masked_lm_example_loss_doc_1 = tf.zeros([1]) + masked_lm_example_loss_doc_2 = tf.zeros([1]) + masked_lm_weights_doc_1 = tf.zeros([1]) + masked_lm_weights_doc_2 = tf.zeros([1]) + + return (input_sent_reps_doc_1_unmask, input_mask_doc_level_1_tensor, + input_sent_reps_doc_2_unmask, input_mask_doc_level_2_tensor, + masked_lm_loss_doc_1, masked_lm_loss_doc_2, + masked_lm_example_loss_doc_1, masked_lm_example_loss_doc_2, + masked_lm_weights_doc_1, masked_lm_weights_doc_2) + diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling_orig.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling_orig.py new file mode 100644 index 000000000..a2373128f --- /dev/null +++ b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling_orig.py @@ -0,0 +1,491 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Dual encoder SMITH models.""" +from npu_bridge.npu_init import * + +import tensorflow.compat.v1 as tf + +from smith import constants +from smith import layers +from smith import loss_fns +from smith import metric_fns +from smith import utils +from smith.bert import modeling +from smith.bert import optimization + +# Add by:TC +import precision_tool.tf_config as npu_tf_config + +def build_smith_dual_encoder(dual_encoder_config, + train_mode, + is_training, + input_ids_1, + input_mask_1, + masked_lm_positions_1, + masked_lm_ids_1, + masked_lm_weights_1, + input_ids_2, + input_mask_2, + masked_lm_positions_2, + masked_lm_ids_2, + masked_lm_weights_2, + use_one_hot_embeddings, + documents_match_labels, + debugging=False): + """Build the dual encoder SMITH model. + + Args: + dual_encoder_config: the configuration file for the dual encoder model. + train_mode: string. The train mode of the current. It can be finetune, + pretrain or joint_train. + is_training: bool. Whether it in training mode. + input_ids_1: int Tensor with shape [batch, max_seq_length]. The input ids of + input examples of text 1. + input_mask_1: int Tensor with shape [batch, max_seq_length]. The input masks + of input examples of text 1. + masked_lm_positions_1: int Tensor with shape [batch, + max_predictions_per_seq]. The input masked LM prediction positions of + input examples of text 1. This can be useful to compute the masked word + prediction LM loss. + masked_lm_ids_1: int Tensor with shape [batch, max_predictions_per_seq]. The + input masked LM prediction ids of input examples of text 1. It is the + ground truth in the masked word LM prediction task. This can be useful to + compute the masked word prediction LM loss. + masked_lm_weights_1: float Tensor with shape [batch, + max_predictions_per_seq]. The input masked LM prediction weights of input + examples of text 1. + input_ids_2: int Tensor with shape [batch, max_seq_length]. The input ids of + input examples of text 2. + input_mask_2: int Tensor with shape [batch, max_seq_length]. The input masks + of input examples of text 2. + masked_lm_positions_2: int Tensor with shape [batch, + max_predictions_per_seq]. The input masked LM prediction positions of + input examples of text 2. This can be useful to compute the masked word + prediction LM loss. + masked_lm_ids_2: int Tensor with shape [batch, max_predictions_per_seq]. 
The + input masked LM prediction ids of input examples of text 2. It is the + ground truth in the masked word LM prediction task. This can be useful to + compute the masked word prediction LM loss. + masked_lm_weights_2: float Tensor with shape [batch, + max_predictions_per_seq]. The input masked LM prediction weights of input + examples of text 2. + use_one_hot_embeddings: bool. Whether use one hot embeddings. + documents_match_labels: float Tensor with shape [batch]. The ground truth + labels for the input examples. + debugging: bool. Whether it is in the debugging mode. + + Returns: + The masked LM loss, per example LM loss, masked sentence LM loss, per + example masked sentence LM loss, sequence representations, text matching + loss, per example text matching loss, text matching logits, text matching + probabilities and text matching log probabilities. + + Raises: + ValueError: if the doc_rep_combine_mode in dual_encoder_config is invalid. + """ + bert_config = modeling.BertConfig.from_json_file( + dual_encoder_config.encoder_config.bert_config_file) + doc_bert_config = modeling.BertConfig.from_json_file( + dual_encoder_config.encoder_config.doc_bert_config_file) + (input_sent_reps_doc_1_unmask, input_mask_doc_level_1_tensor, + input_sent_reps_doc_2_unmask, input_mask_doc_level_2_tensor, + masked_lm_loss_doc_1, masked_lm_loss_doc_2, masked_lm_example_loss_doc_1, + masked_lm_example_loss_doc_2, masked_lm_weights_doc_1, + masked_lm_weights_doc_2) = layers.learn_sent_reps_normal_loop( + dual_encoder_config, is_training, train_mode, input_ids_1, input_mask_1, + masked_lm_positions_1, masked_lm_ids_1, masked_lm_weights_1, input_ids_2, + input_mask_2, masked_lm_positions_2, masked_lm_ids_2, + masked_lm_weights_2, use_one_hot_embeddings) + if debugging: + input_mask_doc_level_1_tensor = tf.Print( + input_mask_doc_level_1_tensor, + [input_mask_doc_level_1_tensor, input_mask_doc_level_2_tensor], + message="input_mask_doc_level_1_tensor in build_smith_dual_encoder", + summarize=30) + + if dual_encoder_config.encoder_config.use_masked_sentence_lm_loss: + batch_size_static = ( + dual_encoder_config.train_eval_config.train_batch_size if is_training + else dual_encoder_config.train_eval_config.eval_batch_size) + # Generates the sentence masked document represenations. + with tf.variable_scope("mask_sent_in_doc", reuse=tf.AUTO_REUSE): + # Randomly initialize a masked sentence vector and reuse it. + # We also need to return the masked sentence position index to get the + # ground truth labels for the masked positions. The shape of + # sent_mask_embedding is [hidden]. + sent_mask_embedding = tf.get_variable( + name="sentence_mask_embedding", + shape=[bert_config.hidden_size], + initializer=tf.truncated_normal_initializer( + stddev=bert_config.initializer_range)) + # Output Shape: [batch, loop_sent_number_per_doc, hidden]. 
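+      # Illustrative sketch (added comment, not from the upstream SMITH code).
+      # get_doc_rep_with_masked_sent replaces the sentence vectors at the
+      # sampled positions with the shared sentence_mask_embedding (the patch
+      # uses tf.tensor_scatter_update for this). Toy NumPy equivalent for one
+      # document:
+      #
+      #   import numpy as np
+      #   n_sent, hidden = 5, 3
+      #   sent_reps = np.ones((n_sent, hidden))
+      #   sent_mask_embedding = np.full(hidden, -1.0)
+      #   masked_positions = np.array([1, 3])     # sampled sentence indices
+      #   sent_reps[masked_positions] = sent_mask_embedding
+      #   # rows 1 and 3 now hold the shared mask vector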
+ (input_sent_reps_doc_1_masked, masked_sent_index_1, + masked_sent_weight_1) = layers.get_doc_rep_with_masked_sent( + input_sent_reps_doc=input_sent_reps_doc_1_unmask, + sent_mask_embedding=sent_mask_embedding, + input_mask_doc_level=input_mask_doc_level_1_tensor, + batch_size_static=batch_size_static, + max_masked_sent_per_doc=dual_encoder_config.encoder_config + .max_masked_sent_per_doc, + loop_sent_number_per_doc=dual_encoder_config.encoder_config + .loop_sent_number_per_doc) + (input_sent_reps_doc_2_masked, masked_sent_index_2, + masked_sent_weight_2) = layers.get_doc_rep_with_masked_sent( + input_sent_reps_doc=input_sent_reps_doc_2_unmask, + sent_mask_embedding=sent_mask_embedding, + input_mask_doc_level=input_mask_doc_level_2_tensor, + batch_size_static=batch_size_static, + max_masked_sent_per_doc=dual_encoder_config.encoder_config + .max_masked_sent_per_doc, + loop_sent_number_per_doc=dual_encoder_config.encoder_config + .loop_sent_number_per_doc) + # Learn the document representations based on masked sentence embeddings. + # Note that the variables in the DocBert model are not within the + # "mask_sent_in_doc" variable scope. + model_doc_1 = modeling.DocBertModel( + config=doc_bert_config, + is_training=is_training, + input_reps=input_sent_reps_doc_1_masked, + input_mask=input_mask_doc_level_1_tensor) + model_doc_2 = modeling.DocBertModel( + config=doc_bert_config, + is_training=is_training, + input_reps=input_sent_reps_doc_2_masked, + input_mask=input_mask_doc_level_2_tensor) + # Shape of masked_sent_lm_loss_1 [1]. + # Shape of masked_sent_lm_example_loss_1 is [batch * + # max_predictions_per_seq]. + (masked_sent_lm_loss_1, masked_sent_per_example_loss_1, + _) = layers.get_masked_sent_lm_output(doc_bert_config, + model_doc_1.get_sequence_output(), + input_sent_reps_doc_1_unmask, + masked_sent_index_1, + masked_sent_weight_1) + (masked_sent_lm_loss_2, masked_sent_per_example_loss_2, + _) = layers.get_masked_sent_lm_output(doc_bert_config, + model_doc_2.get_sequence_output(), + input_sent_reps_doc_2_unmask, + masked_sent_index_2, + masked_sent_weight_2) + else: + # Learn the document representations based on unmasked sentence embeddings. + model_doc_1 = modeling.DocBertModel( + config=doc_bert_config, + is_training=is_training, + input_reps=input_sent_reps_doc_1_unmask, + input_mask=input_mask_doc_level_1_tensor) + model_doc_2 = modeling.DocBertModel( + config=doc_bert_config, + is_training=is_training, + input_reps=input_sent_reps_doc_2_unmask, + input_mask=input_mask_doc_level_2_tensor) + masked_sent_lm_loss_1 = 0 + masked_sent_lm_loss_2 = 0 + masked_sent_per_example_loss_1 = tf.zeros(1) + masked_sent_per_example_loss_2 = tf.zeros(1) + masked_sent_weight_1 = tf.zeros(1) + masked_sent_weight_2 = tf.zeros(1) + + with tf.variable_scope("seq_rep_from_bert_doc_dense", reuse=tf.AUTO_REUSE): + normalized_doc_rep_1 = layers.get_seq_rep_from_bert(model_doc_1) + normalized_doc_rep_2 = layers.get_seq_rep_from_bert(model_doc_2) + + # We also dump the contextualized sentence embedding output by document + # level Transformer model. These representations maybe useful for sentence + # level tasks. + output_sent_reps_doc_1 = model_doc_1.get_sequence_output() + output_sent_reps_doc_2 = model_doc_2.get_sequence_output() + + # Here we support multiple modes to generate the final document + # representations based on the word/sentence/document level representations + # 1. normal: only use the document level representation as the final document + # representations. + # 2. 
sum_concat: firstly compute the sum of all sentence level repsentations. + # Then concatenate the sum vector with the document level representations. + # 3. mean_concat: firstly compute the mean of all sentence level + # repsentations. Then concatenate the mean vector with the document level + # representations. + # 4. attention: firstly compute the weighted sum of sentence level + # representations with attention mechanism, then concatenate the weighted sum + # vector with the document level representations. + # The document level mask is to indicate whether each sentence is + # a real sentence (1) or a paded sentence (0). The shape of + # input_mask_doc_level_1_tensor is [batch, max_doc_length_by_sentence]. The + # shape of input_sent_reps_doc_1_unmask is + # [batch, max_doc_length_by_sentence, hidden]. + final_doc_rep_combine_mode = dual_encoder_config.encoder_config.doc_rep_combine_mode + if final_doc_rep_combine_mode == constants.DOC_COMBINE_NORMAL: + final_doc_rep_1 = normalized_doc_rep_1 + final_doc_rep_2 = normalized_doc_rep_2 + elif final_doc_rep_combine_mode == constants.DOC_COMBINE_SUM_CONCAT: + # Output Shape: [batch, 2*hidden]. + final_doc_rep_1 = tf.concat( + [tf.reduce_sum(input_sent_reps_doc_1_unmask, 1), normalized_doc_rep_1], + axis=1) + final_doc_rep_2 = tf.concat( + [tf.reduce_sum(input_sent_reps_doc_2_unmask, 1), normalized_doc_rep_2], + axis=1) + elif final_doc_rep_combine_mode == constants.DOC_COMBINE_MEAN_CONCAT: + final_doc_rep_1 = tf.concat( + [tf.reduce_mean(input_sent_reps_doc_1_unmask, 1), normalized_doc_rep_1], + axis=1) + final_doc_rep_2 = tf.concat( + [tf.reduce_mean(input_sent_reps_doc_2_unmask, 1), normalized_doc_rep_2], + axis=1) + elif final_doc_rep_combine_mode == constants.DOC_COMBINE_ATTENTION: + final_doc_rep_1 = tf.concat([ + layers.get_attention_weighted_sum( + input_sent_reps_doc_1_unmask, bert_config, is_training, + dual_encoder_config.encoder_config.doc_rep_combine_attention_size), + normalized_doc_rep_1 + ], + axis=1) + final_doc_rep_2 = tf.concat([ + layers.get_attention_weighted_sum( + input_sent_reps_doc_2_unmask, bert_config, is_training, + dual_encoder_config.encoder_config.doc_rep_combine_attention_size), + normalized_doc_rep_2 + ], + axis=1) + else: + raise ValueError("Only normal, sum_concat, mean_concat and attention are" + " supported: %s" % final_doc_rep_combine_mode) + (siamese_loss, siamese_example_loss, + siamese_logits) = loss_fns.get_prediction_loss_cosine( + input_tensor_1=final_doc_rep_1, + input_tensor_2=final_doc_rep_2, + labels=documents_match_labels, + similarity_score_amplifier=dual_encoder_config.loss_config + .similarity_score_amplifier, + neg_to_pos_example_ratio=dual_encoder_config.train_eval_config + .neg_to_pos_example_ratio) + + # The shape of masked_lm_loss_doc is [1]. + # The shape of masked_lm_example_loss_doc is [batch * max_predictions_per_seq, + # max_doc_length_by_sentence]. 
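+  # Illustrative sketch (added comment, not from the upstream SMITH code).
+  # The doc_rep_combine_mode options above differ only in how the sentence
+  # level representations are pooled before being concatenated with the
+  # document level representation. Toy NumPy version of sum_concat and
+  # mean_concat (hypothetical shapes):
+  #
+  #   import numpy as np
+  #   batch, n_sent, hidden = 2, 3, 4
+  #   sent_reps = np.random.randn(batch, n_sent, hidden)
+  #   doc_rep = np.random.randn(batch, hidden)
+  #   sum_concat = np.concatenate([sent_reps.sum(1), doc_rep], axis=1)    # [batch, 2*hidden]
+  #   mean_concat = np.concatenate([sent_reps.mean(1), doc_rep], axis=1)  # [batch, 2*hidden]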
+ return (masked_lm_loss_doc_1, masked_lm_loss_doc_2, + masked_lm_example_loss_doc_1, masked_lm_example_loss_doc_2, + masked_lm_weights_doc_1, masked_lm_weights_doc_2, + masked_sent_lm_loss_1, masked_sent_lm_loss_2, + masked_sent_per_example_loss_1, masked_sent_per_example_loss_2, + masked_sent_weight_1, masked_sent_weight_2, final_doc_rep_1, + final_doc_rep_2, input_sent_reps_doc_1_unmask, + input_sent_reps_doc_2_unmask, output_sent_reps_doc_1, + output_sent_reps_doc_2, siamese_loss, siamese_example_loss, + siamese_logits) + + +def model_fn_builder(dual_encoder_config, + train_mode, + learning_rate, + num_train_steps, + num_warmup_steps, + use_tpu, + use_one_hot_embeddings, + debugging=False): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + tf.logging.info("*** Current mode: %s ***" % mode) + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids_1 = features["input_ids_1"] + input_mask_1 = features["input_mask_1"] + if train_mode == constants.TRAIN_MODE_FINETUNE: + masked_lm_positions_1 = tf.zeros([1]) + masked_lm_ids_1 = tf.zeros([1]) + masked_lm_weights_1 = tf.zeros([1]) + else: + masked_lm_positions_1 = features["masked_lm_positions_1"] + masked_lm_ids_1 = features["masked_lm_ids_1"] + masked_lm_weights_1 = features["masked_lm_weights_1"] + + input_ids_2 = features["input_ids_2"] + input_mask_2 = features["input_mask_2"] + if train_mode == constants.TRAIN_MODE_FINETUNE: + masked_lm_positions_2 = tf.zeros([1]) + masked_lm_ids_2 = tf.zeros([1]) + masked_lm_weights_2 = tf.zeros([1]) + else: + masked_lm_positions_2 = features["masked_lm_positions_2"] + masked_lm_ids_2 = features["masked_lm_ids_2"] + masked_lm_weights_2 = features["masked_lm_weights_2"] + documents_match_labels = features["documents_match_labels"] + # Since the document_match_labels might contain labels like 0/1/2, we need + # to transfer these labels to binary labels like 0/1. + documents_match_labels = tf.cast(documents_match_labels > 0, tf.float32) + is_real_example = None + if "is_real_example" in features: + is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) + else: + is_real_example = tf.ones( + tf.shape(documents_match_labels), dtype=tf.float32) + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + if (dual_encoder_config.encoder_config.model_name == + constants.MODEL_NAME_SMITH_DUAL_ENCODER): + # For the smith model, since the actual looped number of sentences per + # document maybe smaller than max_doc_length_by_sentence, we need to + # overwrite the lm weights with the actual lm weights returned by the + # function. 
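+      # Illustrative note (added comment, not from the upstream SMITH code).
+      # documents_match_labels above may be graded (e.g. 0/1/2) and is reduced
+      # to a binary match label before computing the matching loss, e.g.:
+      #
+      #   import numpy as np
+      #   raw = np.array([0, 1, 2, 0])
+      #   binary = (raw > 0).astype(np.float32)   # -> [0., 1., 1., 0.]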
+ (masked_lm_loss_1, masked_lm_loss_2, masked_lm_example_loss_1, + masked_lm_example_loss_2, masked_lm_weights_1, masked_lm_weights_2, + masked_sent_lm_loss_1, masked_sent_lm_loss_2, + masked_sent_per_example_loss_1, masked_sent_per_example_loss_2, + masked_sent_weight_1, masked_sent_weight_2, seq_embed_1, seq_embed_2, + input_sent_embed_1, input_sent_embed_2, output_sent_embed_1, + output_sent_embed_2, siamese_loss, + siamese_example_loss, siamese_logits) = build_smith_dual_encoder( + dual_encoder_config, train_mode, is_training, input_ids_1, + input_mask_1, masked_lm_positions_1, masked_lm_ids_1, + masked_lm_weights_1, input_ids_2, input_mask_2, + masked_lm_positions_2, masked_lm_ids_2, masked_lm_weights_2, + use_one_hot_embeddings, documents_match_labels, debugging) + else: + raise ValueError( + "Only smith_dual_encoder is supported: %s" % + dual_encoder_config.encoder_config.model_name) + + # There are three different modes for training in the smith model. + # 1. joint_train: a multi-task learning setting which combines the masked + # word LM losses for doc1/doc2 and the siamese matching loss. If we add the + # masked sentence LM task, we also add the masked sentence LM losses for + # the two documents. + # 2. pretrain: only contains the masked word LM losses for doc1/doc2. We + # currently didn't include the NSP loss since NSP loss is not very useful + # according to the XLNet/ RoBERTa/ ALBERT paper. If we add the masked + # sentence LM task, we also add the masked sentence LM losses for the + # two documents. + # 3. finetune: fine tune the model with loaded pretrained checkpoint only + # with the siamese matching loss. If we add the masked sentence LM task, + # we also add the masked sentence LM losses for the two documents. + if train_mode == constants.TRAIN_MODE_JOINT_TRAIN: + total_loss = masked_lm_loss_1 + masked_lm_loss_2 + siamese_loss + elif train_mode == constants.TRAIN_MODE_PRETRAIN: + total_loss = masked_lm_loss_1 + masked_lm_loss_2 + elif train_mode == constants.TRAIN_MODE_FINETUNE: + total_loss = siamese_loss + else: + raise ValueError("Only joint_train, pretrain, finetune are supported.") + # If we add the masked sentence LM task, we also add the masked sentence + # LM losses for the two documents. + if dual_encoder_config.encoder_config.use_masked_sentence_lm_loss: + total_loss += (masked_sent_lm_loss_1 + masked_sent_lm_loss_2) + + total_loss = tf.identity(total_loss, name='total_loss') + + tvars = tf.trainable_variables() + initialized_variable_names = {} + scaffold_fn = None + init_checkpoint = dual_encoder_config.encoder_config.init_checkpoint + # Load pretrained BERT checkpoints if there is a specified path. 
+ if init_checkpoint: + tf.logging.info("**** Passed pretrained BERT checkpoint = %s ****", + init_checkpoint) + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = ", *INIT_RANDOMLY*" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + output_spec = None + predicted_score = tf.sigmoid(siamese_logits) + predicted_class = tf.round(predicted_score) + + if dual_encoder_config.encoder_config.model_name == constants.MODEL_NAME_SMITH_DUAL_ENCODER: + _, prediction_dict = utils.get_export_outputs_prediction_dict_smith_de( + seq_embed_1, seq_embed_2, predicted_score, predicted_class, + documents_match_labels, input_sent_embed_1, input_sent_embed_2, + output_sent_embed_1, output_sent_embed_2) + else: + raise ValueError("Unsupported model: %s" % dual_encoder_config.encoder_config.model_name) + + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer(total_loss, learning_rate, + num_train_steps, + num_warmup_steps, use_tpu) + # Add by:TC 20220705 + output_spec = tf.estimator.EstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + training_hooks=[npu_tf_config.estimator_dump()]) + + elif mode == tf.estimator.ModeKeys.EVAL: + if (train_mode == constants.TRAIN_MODE_JOINT_TRAIN or + train_mode == constants.TRAIN_MODE_PRETRAIN): + eval_metrics = (metric_fns.metric_fn_pretrain, [ + masked_lm_example_loss_1, masked_lm_weights_1, + masked_sent_per_example_loss_1, masked_sent_weight_1, + masked_lm_example_loss_2, masked_lm_weights_2, + masked_sent_per_example_loss_2, masked_sent_weight_2, + predicted_class, documents_match_labels, is_real_example + ]) + elif train_mode == constants.TRAIN_MODE_FINETUNE: + eval_metrics = (metric_fns.metric_fn_finetune, [ + predicted_class, documents_match_labels, siamese_example_loss, + is_real_example + ]) + else: + raise ValueError("Only joint_train, pretrain, finetune are supported.") + output_spec = tf.estimator.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + + elif mode == tf.estimator.ModeKeys.PREDICT: + output_spec = tf.estimator.tpu.TPUEstimatorSpec( + mode=mode, predictions=prediction_dict, scaffold_fn=scaffold_fn) + else: + raise ValueError("Only TRAIN, EVAL, PREDICT modes are supported: %s" % mode) + + return output_spec + + return model_fn + diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_orig.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_orig.py new file mode 100644 index 000000000..2818aabc8 --- /dev/null +++ b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_orig.py @@ -0,0 +1,547 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Library to preprocess text data into SMITH dual encoder model inputs.""" +from npu_bridge.npu_init import * +import collections +import random +import nltk +import tensorflow.compat.v1 as tf +import tqdm +from smith import utils +from smith import wiki_doc_pair_pb2 +from smith.bert import tokenization + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, "Input data path.") + +flags.DEFINE_string( + "output_file", None, + "Output TF examples (or comma-separated list of files) in TFRecord " + "files.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the SMITH model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_bool("add_masks_lm", True, + "If true, add masks for word prediction LM pre-training.") + +flags.DEFINE_integer( + "max_sent_length_by_word", 32, "The maximum length of a sentence by tokens." + "A sentence will be cut off if longer than this length, and will be padded " + "if shorter than it. The sentence can also be a sentence block.") + +flags.DEFINE_integer( + "max_doc_length_by_sentence", 64, + "The maximum length of a document by sentences. 
A " + "document will be cut off if longer than this length, and" + "will be padded if shorter than it.") + +flags.DEFINE_bool( + "greedy_sentence_filling", True, + "If true, apply the greedy sentence filling trick to reduce the " + "number of padded tokens.") + +flags.DEFINE_integer("max_predictions_per_seq", 5, + "Maximum number of masked LM predictions per sequence.") + +flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") + +flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") + + +class TrainingInstance(object): + """A single training instance (sentence pair as dual encoder model inputs).""" + + def __init__(self, + tokens_1, + segment_ids_1, + masked_lm_positions_1, + masked_lm_labels_1, + input_mask_1, + masked_lm_weights_1, + tokens_2, + segment_ids_2, + masked_lm_positions_2, + masked_lm_labels_2, + input_mask_2, + masked_lm_weights_2, + instance_id, + documents_match_labels=-1.0): + self.tokens_1 = tokens_1 + self.segment_ids_1 = segment_ids_1 + self.masked_lm_positions_1 = masked_lm_positions_1 + self.masked_lm_labels_1 = masked_lm_labels_1 + self.input_mask_1 = input_mask_1 + self.masked_lm_weights_1 = masked_lm_weights_1 + self.tokens_2 = tokens_2 + self.segment_ids_2 = segment_ids_2 + self.masked_lm_positions_2 = masked_lm_positions_2 + self.masked_lm_labels_2 = masked_lm_labels_2 + self.input_mask_2 = input_mask_2 + self.masked_lm_weights_2 = masked_lm_weights_2 + self.instance_id = instance_id + self.documents_match_labels = documents_match_labels + + def __str__(self): + s = "" + s += "instance_id: %s\n" % self.instance_id + s += "documents_match_labels: %s\n" % (str(self.documents_match_labels)) + s += "tokens_1: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.tokens_1])) + s += "segment_ids_1: %s\n" % (" ".join([str(x) for x in self.segment_ids_1 + ])) + s += "masked_lm_positions_1: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_positions_1])) + s += "masked_lm_labels_1: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.masked_lm_labels_1])) + s += "input_mask_1: %s\n" % (" ".join([str(x) for x in self.input_mask_1])) + s += "masked_lm_weights_1: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_weights_1])) + s += "tokens_2: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.tokens_2])) + s += "segment_ids_2: %s\n" % (" ".join([str(x) for x in self.segment_ids_2 + ])) + s += "masked_lm_positions_2: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_positions_2])) + s += "masked_lm_labels_2: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.masked_lm_labels_2])) + s += "input_mask_2: %s\n" % (" ".join([str(x) for x in self.input_mask_2])) + s += "masked_lm_weights_2: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_weights_2])) + s += "\n" + return s + + def __repr__(self): + return self.__str__() + + +def add_features_for_one_doc(features, tokens, segment_ids, input_mask, + masked_lm_positions, masked_lm_labels, + masked_lm_weights, tokenizer, doc_index): + """Add features for one document in a WikiDocPair example.""" + input_ids = tokenizer.convert_tokens_to_ids(tokens) + features["input_ids_" + doc_index] = utils.create_int_feature(input_ids) + features["input_mask_" + doc_index] = utils.create_int_feature(input_mask) + features["segment_ids_" + doc_index] = utils.create_int_feature(segment_ids) + + if masked_lm_labels: + masked_lm_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels) + features["masked_lm_positions_" + + doc_index] = 
utils.create_int_feature(masked_lm_positions) + features["masked_lm_ids_" + + doc_index] = utils.create_int_feature(masked_lm_ids) + features["masked_lm_weights_" + + doc_index] = utils.create_float_feature(masked_lm_weights) + + +def write_instance_to_example_files(instances, tokenizer, output_files): + """Create TF example files from `TrainingInstance`s.""" + writers = [] + for output_file in output_files: + writers.append(tf.python_io.TFRecordWriter(output_file)) + writer_index = 0 + total_written = 0 + for (inst_index, instance) in enumerate(instances): + features = collections.OrderedDict() + add_features_for_one_doc( + features=features, + tokens=instance.tokens_1, + segment_ids=instance.segment_ids_1, + input_mask=instance.input_mask_1, + masked_lm_positions=instance.masked_lm_positions_1, + masked_lm_labels=instance.masked_lm_labels_1, + masked_lm_weights=instance.masked_lm_weights_1, + tokenizer=tokenizer, + doc_index="1") + add_features_for_one_doc( + features=features, + tokens=instance.tokens_2, + segment_ids=instance.segment_ids_2, + input_mask=instance.input_mask_2, + masked_lm_positions=instance.masked_lm_positions_2, + masked_lm_labels=instance.masked_lm_labels_2, + masked_lm_weights=instance.masked_lm_weights_2, + tokenizer=tokenizer, + doc_index="2") + # Adds fields on more content/id information of the current example. + features["instance_id"] = utils.create_bytes_feature( + [bytes(instance.instance_id, "utf-8")]) + features["tokens_1"] = utils.create_bytes_feature( + [bytes(t, "utf-8") for t in instance.tokens_1]) + features["tokens_2"] = utils.create_bytes_feature( + [bytes(t, "utf-8") for t in instance.tokens_2]) + # Adds the documents matching labels. + features["documents_match_labels"] = utils.create_float_feature( + [float(instance.documents_match_labels)]) + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + + writers[writer_index].write(tf_example.SerializeToString()) + writer_index = (writer_index + 1) % len(writers) + + total_written += 1 + + if inst_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info( + "tokens_1: %s" % + " ".join([tokenization.printable_text(x) for x in instance.tokens_1])) + tf.logging.info( + "tokens_2: %s" % + " ".join([tokenization.printable_text(x) for x in instance.tokens_2])) + + for feature_name in features.keys(): + feature = features[feature_name] + values = [] + if feature.int64_list.value: + values = feature.int64_list.value + elif feature.float_list.value: + values = feature.float_list.value + elif feature.bytes_list.value: + values = feature.bytes_list.value + tf.logging.info("%s: %s" % + (feature_name, " ".join([str(x) for x in values]))) + + for writer in writers: + writer.close() + + tf.logging.info("Wrote %d total instances", total_written) + + +def get_smith_model_tokens(input_text, tokenizer, sent_token_counter): + """Generate tokens given an input text for the SMITH model.""" + res_tokens = [] + for sent in nltk.tokenize.sent_tokenize(input_text): + # The returned res_tokens is a 2D list to maintain the sentence boundary + # information. We removed all the empty tokens in this step. + if not sent: + continue + tokens = [w for w in tokenizer.tokenize(sent) if w] + sent_token_counter[0] += 1 # Track number of sentences. + sent_token_counter[1] += len(tokens) # Track number of tokens. 
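+    # Illustrative sketch (added comment, not from the upstream SMITH code).
+    # get_smith_model_tokens keeps a 2-D token list so sentence boundaries
+    # survive into the packing step. With NLTK's sentence tokenizer (the punkt
+    # model must be downloaded once) and a whitespace split standing in for
+    # the BERT wordpiece tokenizer:
+    #
+    #   import nltk
+    #   # nltk.download("punkt")
+    #   text = "SMITH encodes long documents. Sentences become blocks."
+    #   res_tokens = [s.split() for s in nltk.tokenize.sent_tokenize(text) if s]
+    #   # -> [['SMITH', 'encodes', 'long', 'documents.'],
+    #   #     ['Sentences', 'become', 'blocks.']]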
+ res_tokens.append(tokens) + return (res_tokens, sent_token_counter) + + +def create_training_instances_wiki_doc_pair( + input_file, tokenizer, max_sent_length_by_word, max_doc_length_by_sentence, + masked_lm_prob, max_predictions_per_seq, rng): + """Create `TrainingInstance`s from WikiDocPair proto data.""" + # The input data is in the WikiDocPair proto format in tfrecord. + # Add by:TC + wiki_doc_pair = wiki_doc_pair_pb2.WikiDocPair() + instances = [] + # Add some counters to track some data statistics. + sent_token_counter = [0, 0] + for example in tqdm.tqdm(tf.python_io.tf_record_iterator(input_file)): + doc_pair = wiki_doc_pair.FromString(example) + # If model_name = smith_dual_encoder, we firstly use a sentence tokenizer + # to split doc_one/doc_two texts into different sentences and use [SEN] to + # label the sentence boundary information. So in the masking and padding + # step, we know the boundary between different sentences and we can do the + # masking and padding according to the actual length of each sentence. + doc_one_text = " \n\n\n\n\n\n ".join( + [a.text for a in doc_pair.doc_one.section_contents]) + doc_two_text = " \n\n\n\n\n\n ".join( + [a.text for a in doc_pair.doc_two.section_contents]) + doc_one_text = tokenization.convert_to_unicode(doc_one_text).strip() + doc_two_text = tokenization.convert_to_unicode(doc_two_text).strip() + doc_one_tokens, sent_token_counter = get_smith_model_tokens( + doc_one_text, tokenizer, sent_token_counter) + doc_two_tokens, sent_token_counter = get_smith_model_tokens( + doc_two_text, tokenizer, sent_token_counter) + # Skip the document pairs if any document is empty. + if not doc_one_tokens or not doc_two_tokens: + continue + vocab_words = list(tokenizer.vocab.keys()) + instance_id = doc_pair.id + if doc_pair.human_label_for_classification: + doc_match_label = doc_pair.human_label_for_classification + else: + # Set the label as 0.0 if there are no available labels. 
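get_smith_model_tokens above returns a 2D token list, one inner list per natural sentence, together with running sentence/token counters. A toy sketch of that behaviour, using a whitespace tokenizer as a stand-in for the BERT FullTokenizer (the real pipeline loads a vocabulary file) and assuming the NLTK punkt model is available:

import nltk  # Requires the 'punkt' sentence model: nltk.download('punkt').

class WhitespaceTokenizer(object):
  """Stand-in for tokenization.FullTokenizer in this sketch."""

  def tokenize(self, text):
    return text.lower().split()

def toy_smith_tokens(text, tokenizer, counter):
  res_tokens = []
  for sent in nltk.tokenize.sent_tokenize(text):
    tokens = [w for w in tokenizer.tokenize(sent) if w]
    if not tokens:
      continue
    counter[0] += 1            # Number of sentences seen so far.
    counter[1] += len(tokens)  # Number of tokens seen so far.
    res_tokens.append(tokens)  # Keep the sentence boundary as a nested list.
  return res_tokens, counter

tokens, counter = toy_smith_tokens(
    "I am in Dominick's for my dinner. OK, no problem.",
    WhitespaceTokenizer(), [0, 0])
print(tokens)   # [['i', 'am', 'in', "dominick's", 'for', 'my', 'dinner.'], ['ok,', 'no', 'problem.']]
print(counter)  # [2, 10]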
+ doc_match_label = 0.0 + instances.append( + create_instance_from_wiki_doc_pair( + instance_id, doc_match_label, doc_one_tokens, doc_two_tokens, + max_sent_length_by_word, max_doc_length_by_sentence, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng)) + rng.shuffle(instances) + return (instances, sent_token_counter) + + +def create_instance_from_wiki_doc_pair(instance_id, doc_match_label, + doc_one_tokens, doc_two_tokens, + max_sent_length_by_word, + max_doc_length_by_sentence, + masked_lm_prob, max_predictions_per_seq, + vocab_words, rng): + """Creates `TrainingInstance`s for a WikiDocPair input data.""" + (tokens_1, segment_ids_1, masked_lm_positions_1, masked_lm_labels_1, \ + input_mask_1, masked_lm_weights_1) = \ + get_tokens_segment_ids_masks(max_sent_length_by_word, max_doc_length_by_sentence, doc_one_tokens, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng) + (tokens_2, segment_ids_2, masked_lm_positions_2, masked_lm_labels_2, \ + input_mask_2, masked_lm_weights_2) = \ + get_tokens_segment_ids_masks(max_sent_length_by_word, max_doc_length_by_sentence, doc_two_tokens, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng) + instance = TrainingInstance( + tokens_1=tokens_1, + segment_ids_1=segment_ids_1, + masked_lm_positions_1=masked_lm_positions_1, + masked_lm_labels_1=masked_lm_labels_1, + input_mask_1=input_mask_1, + masked_lm_weights_1=masked_lm_weights_1, + tokens_2=tokens_2, + segment_ids_2=segment_ids_2, + masked_lm_positions_2=masked_lm_positions_2, + masked_lm_labels_2=masked_lm_labels_2, + input_mask_2=input_mask_2, + masked_lm_weights_2=masked_lm_weights_2, + instance_id=instance_id, + documents_match_labels=doc_match_label) + return instance + + +def get_tokens_segment_ids_masks(max_sent_length_by_word, + max_doc_length_by_sentence, doc_one_tokens, + masked_lm_prob, max_predictions_per_seq, + vocab_words, rng): + """Get the tokens, segment ids and masks of an input sequence.""" + # The format of tokens for SMITH dual encoder models is like: + # [CLS] block1_token1 block1_token2 block1_token3 ... [SEP] [SEP] [PAD] ... + # [CLS] block2_token1 block2_token2 block2_token3 ... [SEP] [SEP] [PAD] ... + # [CLS] block3_token1 block3_token2 block3_token3 ... [SEP] [SEP] [PAD] ... + # If max_sent_length_by_word is large, then there will be many padded + # words in the sentence. Here we added an optional "greedy sentence filling" + # trick in order to reduce the number of padded words and maintain all + # content in the document. We allow a "sentence" block to contain more than + # one natural sentence and try to fill as many as sentences into the + # "sentence" block. If a sentence will be cut off and the current sentence + # block is not empty, we will put the sentence into the next "sentence" block. + # According to ALBERT paper and RoBERTa paper, a segment is usually comprised + # of more than one natural sentence, which has been shown to benefit + # performance. doc_one_tokens is a 2D list which contains the sentence + # boundary information. + sentence_num = len(doc_one_tokens) + # sent_block_token_list is a 2D list to maintain sentence block tokens. + sent_block_token_list = [] + natural_sentence_index = -1 + while natural_sentence_index + 1 < sentence_num: + natural_sentence_index += 1 + sent_tokens = doc_one_tokens[natural_sentence_index] + if not sent_tokens: + continue + if FLAGS.greedy_sentence_filling: + cur_sent_block_length = 0 + cur_sent_block = [] + # Fill as many senteces as possible in the current sentence block in a + # greedy way. 
+ while natural_sentence_index < sentence_num: + cur_natural_sent_tokens = doc_one_tokens[natural_sentence_index] + if not cur_natural_sent_tokens: + natural_sentence_index += 1 + continue + cur_sent_len = len(cur_natural_sent_tokens) + if ((cur_sent_block_length + cur_sent_len) <= + (max_sent_length_by_word - 3)) or cur_sent_block_length == 0: + # One exceptional case here is that if the 1st sentence of a sentence + # block is already going across the boundary, then the current + # sentence block will be empty. So when cur_sent_block_length is 0 + # and we meet a natural sentence with length longer than + # (max_sent_length_by_word - 3), we still put this natural sentence + # in the current sentence block. In this case, this long natural + # sentence will be cut off with the final length up to + # (max_sent_length_by_word - 3). + cur_sent_block.extend(cur_natural_sent_tokens) + cur_sent_block_length += cur_sent_len + natural_sentence_index += 1 + else: + # If cur_sent_block_length + cur_sent_len > max_sent_length_by_word-3 + # and the current sentence block is not empty, the sentence which + # goes across the boundary will be put into the next sentence block. + natural_sentence_index -= 1 + break + sent_tokens = cur_sent_block + sent_block_token_list.append(sent_tokens) + if len(sent_block_token_list) >= max_doc_length_by_sentence: + break # Skip more sentence blocks if the document is too long. + # For each sentence block, generate the token sequences, masks and paddings. + tokens_doc = [] + segment_ids_doc = [] + masked_lm_positions_doc = [] + masked_lm_labels_doc = [] + input_mask_doc = [] + masked_lm_weights_doc = [] + for block_index in range(len(sent_block_token_list)): + tokens_block, segment_ids_block, masked_lm_positions_block, \ + masked_lm_labels_block, input_mask_block, masked_lm_weights_block = \ + get_token_masks_paddings( + sent_block_token_list[block_index], + max_sent_length_by_word, + masked_lm_prob, + max_predictions_per_seq, + vocab_words, + rng, + block_index) + tokens_doc.extend(tokens_block) + segment_ids_doc.extend(segment_ids_block) + masked_lm_positions_doc.extend(masked_lm_positions_block) + masked_lm_labels_doc.extend(masked_lm_labels_block) + input_mask_doc.extend(input_mask_block) + masked_lm_weights_doc.extend(masked_lm_weights_block) + + # Pad sentence blocks if the actual number of sentence blocks is less than + # max_doc_length_by_sentence. + sentence_block_index = len(sent_block_token_list) + while sentence_block_index < max_doc_length_by_sentence: + for _ in range(max_sent_length_by_word): + tokens_doc.append("[PAD]") + segment_ids_doc.append(0) + input_mask_doc.append(0) + for _ in range(max_predictions_per_seq): + masked_lm_positions_doc.append(0) + masked_lm_labels_doc.append("[PAD]") + masked_lm_weights_doc.append(0.0) + sentence_block_index += 1 + assert len(tokens_doc) == max_sent_length_by_word * max_doc_length_by_sentence + assert len(masked_lm_labels_doc + ) == max_predictions_per_seq * max_doc_length_by_sentence + return (tokens_doc, segment_ids_doc, masked_lm_positions_doc, + masked_lm_labels_doc, input_mask_doc, masked_lm_weights_doc) + + +def get_token_masks_paddings(block_tokens, max_sent_length_by_word, + masked_lm_prob, max_predictions_per_seq, + vocab_words, rng, block_index): + """Generates tokens, masks and paddings for the input block tokens.""" + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_sent_length_by_word - 3 + # Truncates the sequence if sequence length is longer than max_num_tokens. 
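The loop above implements the greedy sentence filling trick: natural sentences are packed into fixed-size "sentence" blocks, three positions per block are reserved for [CLS], [SEP], [SEP], and a sentence that would cross the block boundary starts the next block unless the current block is still empty. The same policy in a standalone sketch (list of token lists in, list of blocks out), not the patch's own code:

def greedy_fill(sentences, max_sent_length_by_word, max_doc_length_by_sentence):
  blocks = []
  budget = max_sent_length_by_word - 3  # Reserve [CLS], [SEP], [SEP].
  i = 0
  while i < len(sentences) and len(blocks) < max_doc_length_by_sentence:
    block, block_len = [], 0
    while i < len(sentences):
      sent = sentences[i]
      if block_len + len(sent) <= budget or block_len == 0:
        # A first sentence longer than the budget still goes in; it is
        # truncated later when the special tokens are added.
        block.extend(sent)
        block_len += len(sent)
        i += 1
      else:
        break  # Sentence would cross the boundary; start a new block.
    blocks.append(block)
  return blocks

sents = [["i", "am", "in"], ["for", "my", "dinner"], ["ok", "no", "problem"]]
print(greedy_fill(sents, max_sent_length_by_word=10, max_doc_length_by_sentence=3))
# [['i', 'am', 'in', 'for', 'my', 'dinner'], ['ok', 'no', 'problem']]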
+ tokens = [] + segment_ids = [] + if len(block_tokens) > max_num_tokens: + block_tokens = block_tokens[0:max_num_tokens] + tokens_a = block_tokens + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + masked_lm_positions = [] + masked_lm_labels = [] + masked_lm_weights = [] + if max_predictions_per_seq > 0: + (tokens, masked_lm_positions, + masked_lm_labels) = utils.create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) + # Add [PAD] to tokens and masked LM related lists. + input_mask = [1] * len(tokens) + while len(tokens) < max_sent_length_by_word: + tokens.append("[PAD]") + input_mask.append(0) + segment_ids.append(0) + + assert len(tokens) == max_sent_length_by_word + assert len(input_mask) == max_sent_length_by_word + assert len(segment_ids) == max_sent_length_by_word + + if max_predictions_per_seq > 0: + # Transfer local positions in masked_lm_positions to global positions in the + # whole document to be consistent with the model training pipeline. + masked_lm_positions = [ + (i + max_sent_length_by_word * block_index) for i in masked_lm_positions + ] + masked_lm_weights = [1.0] * len(masked_lm_labels) + + while len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_labels.append("[PAD]") + masked_lm_weights.append(0.0) + return (tokens, segment_ids, masked_lm_positions, masked_lm_labels, + input_mask, masked_lm_weights) + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) + + tf.logging.info("*** Reading from input files ***") + for input_file in input_files: + tf.logging.info(" %s", input_file) + rng = random.Random(FLAGS.random_seed) + # Creates training instances. + max_predictions_per_seq = FLAGS.max_predictions_per_seq if FLAGS.add_masks_lm else 0 + masked_lm_prob = FLAGS.masked_lm_prob if FLAGS.add_masks_lm else 0 + instances, sent_token_counter = create_training_instances_wiki_doc_pair( + input_file=FLAGS.input_file, + tokenizer=tokenizer, + max_sent_length_by_word=FLAGS.max_sent_length_by_word, + max_doc_length_by_sentence=FLAGS.max_doc_length_by_sentence, + masked_lm_prob=masked_lm_prob, + max_predictions_per_seq=max_predictions_per_seq, + rng=rng) + + output_files = FLAGS.output_file.split(",") + tf.logging.info("*** Writing to output files ***") + for output_file in output_files: + tf.logging.info(" %s", output_file) + + # Transfers training instances into tensorflow examples and write the results. + write_instance_to_example_files(instances, tokenizer, output_files) + + # Finally outputs some data statistics. 
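get_token_masks_paddings above builds per-block token, segment, mask and padding lists and then shifts the masked-LM positions from block-local to document-global indices so they line up with the concatenated tokens_doc sequence. A small sketch of that index arithmetic:

# Local masked positions of one sentence block are shifted by
# block_index * max_sent_length_by_word to become document-global positions.
max_sent_length_by_word = 20
block_index = 2                 # Third sentence block in the document.
local_positions = [1, 5, 9]     # Positions inside the block.
global_positions = [p + max_sent_length_by_word * block_index
                    for p in local_positions]
print(global_positions)         # [41, 45, 49]
# The model side recovers the local position with
# local = global % max_sent_length_by_word, matching the tf.mod call in layers.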
+ tf.logging.info("sent_count, token_count, doc_pair_count: %d %d %d", + sent_token_counter[0], sent_token_counter[1], len(instances)) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("output_file") + flags.mark_flag_as_required("vocab_file") + tf.app.run() + diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test.py index f4b828c75..4a70f3736 100644 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test.py +++ b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2021 The Google Research Authors. +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,8 +12,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# ============================================================================== -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2021 The Google Research Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -26,7 +27,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# ============================================================================== from npu_bridge.npu_init import * import random @@ -35,8 +35,8 @@ import tempfile from absl import flags import tensorflow.compat.v1 as tf -from smith import preprocessing_smith -from smith.bert import tokenization +from smith_npu_20220702105238 import preprocessing_smith +from smith_npu_20220702105238.bert import tokenization FLAGS = flags.FLAGS @@ -72,8 +72,11 @@ class PreprocessingSmithTest(tf.test.TestCase): self.masked_lm_prob = 0 def test_get_tokens_segment_ids_masks(self): - (tokens_1, segment_ids_1, _, _, input_mask_1, _) = \ - preprocessing_smith.get_tokens_segment_ids_masks( + + tokens_segment_ids_masks = preprocessing_smith.GetTokensSegmentIdsMasks() + + tokens_segment_ids_masks_res = \ + tokens_segment_ids_masks.get_tokens_segment_ids_masks( max_sent_length_by_word=self.max_sent_length_by_word, max_doc_length_by_sentence=self.max_doc_length_by_sentence, doc_one_tokens=self.doc_one_tokens, @@ -81,6 +84,10 @@ class PreprocessingSmithTest(tf.test.TestCase): max_predictions_per_seq=self.max_predictions_per_seq, vocab_words=self.vocab_words, rng=self.rng) + + tokens_1, segment_ids_1, _, _, input_mask_1, _ = tokens_segment_ids_masks_res.tokens_doc, tokens_segment_ids_masks_res.segment_ids_doc, tokens_segment_ids_masks_res.masked_lm_positions_doc,\ + tokens_segment_ids_masks_res.masked_lm_labels_doc, tokens_segment_ids_masks_res.input_mask_doc, tokens_segment_ids_masks_res.masked_lm_weights_doc + self.assertEqual(tokens_1, [ "[CLS]", "i", "am", "in", "[UNK]", "[UNK]", "[UNK]", "for", "my", "dinner", "[UNK]", "ok", ",", "no", "problem", "[UNK]", "[SEP]", diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test_orig.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test_orig.py new file mode 100644 index 000000000..f4b828c75 
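The test changes above switch from calling preprocessing_smith.get_tokens_segment_ids_masks directly to a GetTokensSegmentIdsMasks object whose result exposes named fields. The refactored class itself is not included in this patch; a namedtuple-style container with the field names used by the test might look roughly like the sketch below (an assumption, for illustration only):

import collections

TokensSegmentIdsMasksResult = collections.namedtuple(
    "TokensSegmentIdsMasksResult",
    ["tokens_doc", "segment_ids_doc", "masked_lm_positions_doc",
     "masked_lm_labels_doc", "input_mask_doc", "masked_lm_weights_doc"])

# The test then unpacks the fields by attribute access:
res = TokensSegmentIdsMasksResult(
    tokens_doc=["[CLS]", "i", "[SEP]"], segment_ids_doc=[0, 0, 0],
    masked_lm_positions_doc=[], masked_lm_labels_doc=[],
    input_mask_doc=[1, 1, 1], masked_lm_weights_doc=[])
tokens_1, input_mask_1 = res.tokens_doc, res.input_mask_doc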
--- /dev/null +++ b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test_orig.py @@ -0,0 +1,108 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from npu_bridge.npu_init import * +import random +import tempfile + +from absl import flags +import tensorflow.compat.v1 as tf + +from smith import preprocessing_smith +from smith.bert import tokenization + +FLAGS = flags.FLAGS + + +class PreprocessingSmithTest(tf.test.TestCase): + + def setUp(self): + super(PreprocessingSmithTest, self).setUp() + doc_one_text = ( + "I am in Dominick's for my dinner. OK, no problem. 
I am " + "in Dominick's for my dinner which is the best dinner I have " + "in my whole life.") + doc_one_text = tokenization.convert_to_unicode(doc_one_text).strip() + vocab_tokens = [ + "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "i", "am", "in", "for", + "my", "dinner", "ok", "no", "problem", "which", "is", "the", "be", + "##s", "##t", "," + ] + with tempfile.NamedTemporaryFile(delete=False) as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens + ]).encode("utf-8")) + self.vocab_file = vocab_writer.name + self.tokenizer = tokenization.FullTokenizer( + vocab_file=self.vocab_file, do_lower_case=True) + self.vocab_words = list(self.tokenizer.vocab.keys()) + self.rng = random.Random(12345) + self.doc_one_tokens, _ = preprocessing_smith.get_smith_model_tokens( + doc_one_text, self.tokenizer, [0, 0]) + self.max_sent_length_by_word = 20 + self.max_doc_length_by_sentence = 3 + self.greedy_sentence_filling = True + self.max_predictions_per_seq = 0 + self.masked_lm_prob = 0 + + def test_get_tokens_segment_ids_masks(self): + (tokens_1, segment_ids_1, _, _, input_mask_1, _) = \ + preprocessing_smith.get_tokens_segment_ids_masks( + max_sent_length_by_word=self.max_sent_length_by_word, + max_doc_length_by_sentence=self.max_doc_length_by_sentence, + doc_one_tokens=self.doc_one_tokens, + masked_lm_prob=self.masked_lm_prob, + max_predictions_per_seq=self.max_predictions_per_seq, + vocab_words=self.vocab_words, + rng=self.rng) + self.assertEqual(tokens_1, [ + "[CLS]", "i", "am", "in", "[UNK]", "[UNK]", "[UNK]", "for", "my", + "dinner", "[UNK]", "ok", ",", "no", "problem", "[UNK]", "[SEP]", + "[SEP]", "[PAD]", "[PAD]", "[CLS]", "i", "am", "in", "[UNK]", "[UNK]", + "[UNK]", "for", "my", "dinner", "which", "is", "the", "be", "##s", + "##t", "dinner", "i", "[SEP]", "[SEP]", "[PAD]", "[PAD]", "[PAD]", + "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", + "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", + "[PAD]" + ]) + self.assertEqual(segment_ids_1, [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + ]) + self.assertEqual(input_mask_1, [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + ]) + + +if __name__ == "__main__": + tf.test.main() + -- Gitee From 6a2f73a8b449dcb90c69c7911c1897c97cf7bf17 Mon Sep 17 00:00:00 2001 From: QiuYao Date: Tue, 27 Sep 2022 20:18:33 +0800 Subject: [PATCH 2/3] add atc code --- .../bert/modeling.py | 76 ++- .../layers_orig.py | 570 ------------------ .../modeling_orig.py | 491 --------------- .../preprocessing_smith_orig.py | 547 ----------------- .../preprocessing_smith_test_orig.py | 108 ---- 5 files changed, 44 insertions(+), 1748 deletions(-) delete mode 100644 TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/layers_orig.py delete mode 100644 TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling_orig.py delete mode 100644 TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_orig.py delete mode 100644 TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test_orig.py diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/modeling.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/modeling.py index 1ca7d4124..567aca741 100644 --- 
a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/modeling.py +++ b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/modeling.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2021 The Google Research Authors. +# Copyright 2022 The Google Research Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,7 +18,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from npu_bridge.npu_init import * import collections import copy @@ -30,6 +29,8 @@ import six from six.moves import range import tensorflow.compat.v1 as tf import tf_slim as slim +from npu_bridge.npu_init import * +from npu_bridge.estimator.npu_unary_ops import npu_unary_ops class BertConfig(object): @@ -42,8 +43,8 @@ class BertConfig(object): num_attention_heads=12, intermediate_size=3072, hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, + # hidden_dropout_prob=0.1, + # attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, initializer_range=0.02): @@ -77,8 +78,8 @@ class BertConfig(object): self.num_attention_heads = num_attention_heads self.hidden_act = hidden_act self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob + # self.hidden_dropout_prob = hidden_dropout_prob + # self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range @@ -162,9 +163,9 @@ class DocBertModel(object): is invalid. """ config = copy.deepcopy(config) - if not is_training: - config.hidden_dropout_prob = 0.0 - config.attention_probs_dropout_prob = 0.0 + # if not is_training: + # config.hidden_dropout_prob = 0.0 + # config.attention_probs_dropout_prob = 0.0 input_shape = get_shape_list(input_reps, expected_rank=3) batch_size = input_shape[0] @@ -191,8 +192,7 @@ class DocBertModel(object): use_position_embeddings=True, position_embedding_name="position_embeddings", initializer_range=config.initializer_range, - max_position_embeddings=config.max_position_embeddings, - dropout_prob=config.hidden_dropout_prob) + max_position_embeddings=config.max_position_embeddings) with tf.variable_scope("doc_encoder"): # This converts a 2D mask of shape [batch_size, seq_length] to a 3D @@ -212,8 +212,8 @@ class DocBertModel(object): num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, intermediate_act_fn=get_activation(config.hidden_act), - hidden_dropout_prob=config.hidden_dropout_prob, - attention_probs_dropout_prob=config.attention_probs_dropout_prob, + # hidden_dropout_prob=config.hidden_dropout_prob, + # attention_probs_dropout_prob=config.attention_probs_dropout_prob, initializer_range=config.initializer_range, do_return_all_layers=True) @@ -327,9 +327,9 @@ class BertModel(object): is invalid. 
""" config = copy.deepcopy(config) - if not is_training: - config.hidden_dropout_prob = 0.0 - config.attention_probs_dropout_prob = 0.0 + # if not is_training: + # config.hidden_dropout_prob = 0.0 + # config.attention_probs_dropout_prob = 0.0 input_shape = get_shape_list(input_ids, expected_rank=2) batch_size = input_shape[0] @@ -365,7 +365,7 @@ class BertModel(object): position_embedding_name="position_embeddings", initializer_range=config.initializer_range, max_position_embeddings=config.max_position_embeddings, - dropout_prob=config.hidden_dropout_prob, + # dropout_prob=config.hidden_dropout_prob, trainable=sent_bert_trainable) with tf.variable_scope("encoder"): @@ -385,8 +385,8 @@ class BertModel(object): num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, intermediate_act_fn=get_activation(config.hidden_act), - hidden_dropout_prob=config.hidden_dropout_prob, - attention_probs_dropout_prob=config.attention_probs_dropout_prob, + # hidden_dropout_prob=config.hidden_dropout_prob, + # attention_probs_dropout_prob=config.attention_probs_dropout_prob, initializer_range=config.initializer_range, do_return_all_layers=True, trainable=sent_bert_trainable) @@ -541,10 +541,14 @@ def dropout(input_tensor, dropout_prob): Returns: A version of `input_tensor` with dropout applied. """ - if dropout_prob is None or dropout_prob == 0.0: - return input_tensor + # if dropout_prob is None or dropout_prob == 0.0: + # return input_tensor + + # TODO: Annotate by TC + # output = tf.nn.dropout(input_tensor, rate=dropout_prob) + # TODO: Update by:TC + output = npu_ops.dropout(input_tensor, dropout_prob) - output = tf.nn.dropout(input_tensor, rate=dropout_prob) return output @@ -554,10 +558,12 @@ def layer_norm(input_tensor, name=None): inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) -def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): +def layer_norm_and_dropout(input_tensor, name=None): """Runs layer normalization followed by dropout.""" output_tensor = layer_norm(input_tensor, name) - output_tensor = dropout(output_tensor, dropout_prob) + # TODO: 删除dropout + # output_tensor = dropout(output_tensor, dropout_prob) + return output_tensor @@ -628,7 +634,7 @@ def embedding_postprocessor(input_tensor, position_embedding_name="position_embeddings", initializer_range=0.02, max_position_embeddings=512, - dropout_prob=0.1, + # dropout_prob=0.1, trainable=True): """Performs various post-processing on a word embedding tensor. @@ -715,7 +721,7 @@ def embedding_postprocessor(input_tensor, position_broadcast_shape) output += position_embeddings - output = layer_norm_and_dropout(output, dropout_prob) + output = layer_norm_and_dropout(output) return output @@ -1052,7 +1058,9 @@ def attention_layer(from_tensor, # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. 
- attention_probs = dropout(attention_probs, attention_probs_dropout_prob) + + # TODO: 删除dropout + # attention_probs = dropout(attention_probs, attention_probs_dropout_prob) # `context_layer` = [B, F, N, H] context_layer = tf.einsum("BNFT,BTNH->BFNH", attention_probs, value_layer) @@ -1068,7 +1076,7 @@ def transformer_model(input_tensor, intermediate_size=3072, intermediate_act_fn=gelu, hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, + # attention_probs_dropout_prob=0.1, initializer_range=0.02, do_return_all_layers=False, trainable=True): @@ -1139,7 +1147,7 @@ def transformer_model(input_tensor, attention_mask=attention_mask, num_attention_heads=num_attention_heads, size_per_head=attention_head_size, - attention_probs_dropout_prob=attention_probs_dropout_prob, + # attention_probs_dropout_prob=attention_probs_dropout_prob, initializer_range=initializer_range, trainable=trainable) @@ -1155,7 +1163,10 @@ def transformer_model(input_tensor, None, "dense", trainable=trainable) - attention_output = dropout(attention_output, hidden_dropout_prob) + + # TODO: 删除dropout + # attention_output = dropout(attention_output, hidden_dropout_prob) + # Implementation of residual connections. attention_output = layer_norm( input_tensor=attention_output + layer_input) @@ -1179,7 +1190,9 @@ def transformer_model(input_tensor, None, "dense", trainable=trainable) - layer_output = dropout(layer_output, hidden_dropout_prob) + # TODO: 删除dropout + # layer_output = dropout(layer_output, hidden_dropout_prob) + layer_output = layer_norm( input_tensor=layer_output + attention_output) prev_output = layer_output @@ -1283,4 +1296,3 @@ def assert_rank(tensor, expected_rank, name=None): "For the tensor `%s` in scope `%s`, the actual rank " "`%d` (shape = %s) is not equal to the expected rank `%s`" % (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/layers_orig.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/layers_orig.py deleted file mode 100644 index 7f1254935..000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/layers_orig.py +++ /dev/null @@ -1,570 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -"""Model layers in dual encoder SMITH model.""" -from npu_bridge.npu_init import * -from six.moves import range -from npu_bridge.estimator.npu import npu_convert_dropout -import tensorflow.compat.v1 as tf - -from smith import constants -from smith.bert import modeling - - -def get_doc_rep_with_masked_sent(input_sent_reps_doc, - sent_mask_embedding, - input_mask_doc_level, - batch_size_static=32, - max_masked_sent_per_doc=2, - loop_sent_number_per_doc=32): - """Get the document representations with masked sentences. - - Args: - input_sent_reps_doc: float Tensor. The independent sentence embeddings - without masks for the sentences in the current document. The shape is - [batch, loop_sent_number_per_doc, hidden]. - sent_mask_embedding: float Tensor. The sentence embedding vector for the - masked position. The shape is [hidden]. - input_mask_doc_level: int Tensor. The input masks on the document level to - identify whether a location is a real sentence (mask = 1) or a padded - sentence (mask = 0). The shape is [batch, loop_sent_number_per_doc]. - batch_size_static: scalar. The static batch size depending on the training - or the evaluation mode. - max_masked_sent_per_doc: scalar. The maximum number of masked sentences - per document. - loop_sent_number_per_doc: scalar. The number of looped sentences per - document. - - Returns: - The document representations with masked sentences and the positions/ - weights for each masked sentences. This masked sentence weight is 1 for the - sampled real sentence position and 0 for the padded sentence position. - """ - # We at least mask two sentences to build a candidate sentence pool for - # negative sentence sampling. We generate the masked_sent_index and - # masked_sent_weight for each document. Note that we do not add any word - # or sentence level masks during prediction or inference stage. - max_masked_sent_per_doc = max(max_masked_sent_per_doc, 2) - input_sent_reps_doc_list = tf.unstack( - input_sent_reps_doc, num=batch_size_static) - real_sent_number_per_doc = tf.unstack( - tf.reduce_sum(input_mask_doc_level, 1), num=batch_size_static) - masked_sent_index_list = [] - masked_sent_weight_list = [] - - # For each example in the current batch, we randomly sample - # max_masked_sent_per_doc positions to mask the sentences. For each masked - # sentence position, the sentence in the current position is the positive - # example. The other co-masked sentences are the negative examples. - # The sampled sentence indexes will not be duplicated. - for batch_i in range(0, batch_size_static): - # Since everything in TPU must have a fixed shape, here the max sampled - # sentence index can be as large as loop_sent_number_per_doc. We will - # generate the corresponding sentence LM weights to reduce the impact - # on the final masked sentence LM loss following a similar way with the - # handling of masked word LM loss and masked word LM weights. 
- real_sent_number = real_sent_number_per_doc[batch_i] - sampled_sent_index = tf.slice( - tf.random_shuffle(tf.range(loop_sent_number_per_doc)), [0], - [max_masked_sent_per_doc]) - sampled_sent_index = tf.sort(sampled_sent_index) - masked_sent_index_list.append(sampled_sent_index) - # Generates the corresponding sampled_sent_weight - sample_sent_weight = tf.cast( - tf.less(sampled_sent_index, real_sent_number), tf.float32) - masked_sent_weight_list.append(sample_sent_weight) - - indices = tf.reshape(sampled_sent_index, [max_masked_sent_per_doc, -1]) - # Duplicates sent_mask_embedding for each masked position. - updates = tf.reshape( - tf.tile( - sent_mask_embedding, - [max_masked_sent_per_doc], - ), [max_masked_sent_per_doc, -1]) - input_sent_reps_doc_list[batch_i] = tf.tensor_scatter_update( - input_sent_reps_doc_list[batch_i], indices, updates) - # Here masked_sent_index_list is a list a tensors, where each tensor stores - # the masked sentence positions for each document in the current batch. The - # shape of masked_sent_index_list is [batch, max_masked_sent_per_doc]. - # Here masked_sent_weight_list is a list a tensors, where each tensor stores - # the masked sentence weights for each document in the current batch. The - # shape of masked_sent_weight_list is [batch, max_masked_sent_per_doc]. - return (tf.stack(input_sent_reps_doc_list), tf.stack(masked_sent_index_list), - tf.stack(masked_sent_weight_list)) - - -def get_masked_sent_lm_output(bert_config, - input_tensor, - cur_sent_reps_doc_unmask, - sent_masked_positions, - sent_masked_weights, - debugging=False): - """Get the sentence level masked LM loss. - - Args: - bert_config: BertConfig object. The configuration file for the document - level BERT model. - input_tensor: float Tensor. The contextualized representations of all - sentences learned by the document level BERT model. The shape is [batch, - loop_sent_number_per_doc, hidden]. This is the model prediction. - cur_sent_reps_doc_unmask: float Tensor. The unmasked sentence - representations of the current document. The shape is [batch, - loop_sent_number_per_doc, hidden]. This is the source of the ground - truth and negative examples in the masked sentence prediction. - sent_masked_positions: int Tensor. The masked sentence positions in the - current document. The shape is [batch, max_masked_sent_per_doc]. - sent_masked_weights: float Tensor. The masked sentence weights in the - current document. The shape is [batch, max_masked_sent_per_doc]. - debugging: bool. Whether it is in the debugging mode. - - Returns: - The masked sentence LM loss and the mask sentence LM loss per example. - - """ - # The current method for masked sentence prediction: we approach this problem - # as a multi-class classification problem similar to the masked word LM task. - # For each masked sentence position, the sentence in the current position is - # the positive example. The other co-masked sentences in the current document - # and in the other documents of the same batch are the negative examples. We - # compute the cross entropy loss over the sentence prediction task following - # the implementation of the masked word LM loss in the BERT model. - - input_tensor_shape = modeling.get_shape_list(input_tensor) - batch_size = input_tensor_shape[0] - masked_position_shape = modeling.get_shape_list(sent_masked_positions) - max_predictions_per_seq = masked_position_shape[1] - - # In the context of masked sentence prediction, the max_predictions_per_seq - # is the same with max_masked_sent_per_doc. 
- # Output Shape: [batch * max_predictions_per_seq, hidden]. - # Input_tensor is the model prediction for each position. - input_tensor = gather_indexes(input_tensor, sent_masked_positions) - # Independent_sent_embeddings is the ground truth input sentence embeddings - # for the document level BERT model. The output shape is [batch * - # max_predictions_per_seq, hidden]. - independent_sent_embeddings = gather_indexes(cur_sent_reps_doc_unmask, - sent_masked_positions) - - with tf.variable_scope("cls/sent_predictions", reuse=tf.AUTO_REUSE): - # We apply one more non-linear transformation before the output layer. - # This matrix is not used after pre-training. - with tf.variable_scope("transform"): - input_tensor = tf.layers.dense( - input_tensor, - units=bert_config.hidden_size, - activation=modeling.get_activation(bert_config.hidden_act), - kernel_initializer=modeling.create_initializer( - bert_config.initializer_range)) - # Output Shape: [batch * max_predictions_per_seq, hidden]. - input_tensor = modeling.layer_norm(input_tensor) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each predicted position. - output_bias = tf.get_variable( - "output_bias", - shape=[batch_size * max_predictions_per_seq], - initializer=tf.zeros_initializer()) - # Shape of input_tensor [batch * max_predictions_per_seq, hidden]. - # Shape of independent_sent_embeddings is [batch * max_predictions_per_seq, - # hidden]. - # Shape of logits: [batch * max_predictions_per_seq, - # batch * max_predictions_per_seq]. - logits = tf.matmul( - input_tensor, independent_sent_embeddings, transpose_b=True) - logits = tf.nn.bias_add(logits, output_bias) - # Output Shape: [batch * max_predictions_per_seq, - # batch * max_predictions_per_seq]. - log_probs = tf.nn.log_softmax(logits, axis=-1) - - # Output Shape: [batch * max_predictions_per_seq]. - # Double checked the setting of label_ids here. The label_ids - # should be the label index in the "sentence vocabulary". Thus if batch=32, - # max_predictions_per_seq = 2, then label ids should be like - # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ..., 63]. For the ground truth one hot - # label matrix, only the values in the diagonal positions are 1. All the - # other positions should be 0. - label_ids = tf.range( - 0, batch_size * max_predictions_per_seq, dtype=tf.int32) - if debugging: - label_ids = tf.Print( - label_ids, [label_ids], - message="label_ids in get_masked_sent_lm_output", - summarize=30) - # Output Shape: [batch * max_predictions_per_seq]. - # The label_weights is the flatten vector based on sent_masked_weights, - # where the weight is 1.0 for sampled real sentences and 0.0 for sampled - # masked sentences. - label_weights = tf.reshape(sent_masked_weights, [-1]) - - # Output Shape: [batch * max_predictions_per_seq, - # batch * max_predictions_per_seq]. - one_hot_labels = tf.one_hot( - label_ids, depth=batch_size * max_predictions_per_seq, dtype=tf.float32) - - # Output Shape: [batch * max_predictions_per_seq]. - per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) - # Output Shape: [1]. - numerator = tf.reduce_sum(label_weights * per_example_loss) - # Output Shape: [1]. - denominator = tf.reduce_sum(label_weights) + 1e-5 - # Output Shape: [1]. - loss = numerator / denominator - # Shape of loss [1]. - # Shape of per_example_loss is [batch * max_predictions_per_seq]. 
- return (loss, per_example_loss, log_probs) - - -def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, - label_ids, label_weights): - """Get loss and log probs for the masked LM.""" - # Output Shape: [batch * max_predictions_per_seq, hidden]. - input_tensor = gather_indexes(input_tensor, positions) - - with tf.variable_scope("cls/word_predictions", reuse=tf.AUTO_REUSE): - # We apply one more non-linear transformation before the output layer. - # This matrix is not used after pre-training. - with tf.variable_scope("transform"): - input_tensor = tf.layers.dense( - input_tensor, - units=bert_config.hidden_size, - activation=modeling.get_activation(bert_config.hidden_act), - kernel_initializer=modeling.create_initializer( - bert_config.initializer_range)) - # Output Shape: [batch * max_predictions_per_seq, hidden]. - input_tensor = modeling.layer_norm(input_tensor) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - output_bias = tf.get_variable( - "output_bias", - shape=[bert_config.vocab_size], - initializer=tf.zeros_initializer()) - # Shape of input_tensor [batch * max_predictions_per_seq, embedding_size]. - # Shape of output_weights (embed table) is [vocab_size, embedding_size]. - # In the current Bert implementation: embedding_size = hidden. - logits = tf.matmul(input_tensor, output_weights, transpose_b=True) - logits = tf.nn.bias_add(logits, output_bias) - # Output Shape: [batch * max_predictions_per_seq, vocab_size]. - log_probs = tf.nn.log_softmax(logits, axis=-1) - - # Output Shape: [batch * max_predictions_per_seq]. - label_ids = tf.reshape(label_ids, [-1]) - # Output Shape: [batch * max_predictions_per_seq]. - label_weights = tf.reshape(label_weights, [-1]) - - # Output Shape: [batch * max_predictions_per_seq, vocab_size]. - one_hot_labels = tf.one_hot( - label_ids, depth=bert_config.vocab_size, dtype=tf.float32) - - # The `positions` tensor might be zero-padded (if the sequence is too - # short to have the maximum number of predictions). The `label_weights` - # tensor has a value of 1.0 for every real prediction and 0.0 for the - # padding predictions. - # Output Shape: [batch * max_predictions_per_seq]. - per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) - # Output Shape: [1]. - numerator = tf.reduce_sum(label_weights * per_example_loss) - # Output Shape: [1]. - denominator = tf.reduce_sum(label_weights) + 1e-5 - # Output Shape: [1]. - loss = numerator / denominator - # Shape of loss [1]. - # Shape of per_example_loss is [batch * max_predictions_per_seq]. - return (loss, per_example_loss, log_probs) - - -def gather_indexes(sequence_tensor, positions): - """Gathers the vectors at the specific positions over a minibatch.""" - # Shape of positions: [batch, max_mask_per_seq]. - sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) - batch_size = sequence_shape[0] - seq_length = sequence_shape[1] - width = sequence_shape[2] - - # Shape of flat_offsets: [batch, 1]. - flat_offsets = tf.reshape( - tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) - flat_positions = tf.reshape(positions + flat_offsets, [-1]) - flat_sequence_tensor = tf.reshape(sequence_tensor, - [batch_size * seq_length, width]) - output_tensor = tf.gather(flat_sequence_tensor, flat_positions) - # The shape of output_tensor [batch * max_mask_per_seq, hidden]. 
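gather_indexes above flattens the [batch, seq_length, hidden] tensor and adds a per-example offset so a single tf.gather can pick out the masked positions for the whole batch. The same offset arithmetic in a plain-Python sketch:

batch_size, seq_length = 2, 5
sequence = [["b%d_t%d" % (b, t) for t in range(seq_length)]
            for b in range(batch_size)]
positions = [[1, 3], [0, 4]]  # Per-example positions to gather.

# Flatten the batch and shift each example's positions by b * seq_length.
flat_sequence = [tok for row in sequence for tok in row]
flat_positions = [b * seq_length + p
                  for b, row in enumerate(positions) for p in row]
gathered = [flat_sequence[i] for i in flat_positions]
print(gathered)  # ['b0_t1', 'b0_t3', 'b1_t0', 'b1_t4']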
- return output_tensor - - -def get_attention_weighted_sum(input_tensor, bert_config, is_training, - attention_size): - """Compute the attentive weighted sum of an input tensor. - - Args: - input_tensor: The input tensor for attentive representation. The shape of - input tensor is [batch, seq_length, hidden]. - bert_config: The model config file. - is_training: If true, it is in training mode. - attention_size: int. Dimension of contextual vector. - - Returns: - The attentive representation of the input tensor. The shape of the output - tensor is [batch, hidden]. - """ - with tf.variable_scope("combine_reps_attention", reuse=tf.AUTO_REUSE): - context_vector = tf.get_variable( - name="context_vector", - shape=[attention_size], - dtype=tf.float32) - # Output Shape: [batch, seq_length, attention_size]. - projection = tf.layers.dense( - input_tensor, - attention_size, - activation=tf.tanh, - kernel_initializer=modeling.create_initializer( - bert_config.initializer_range)) - # Output Shape: [batch, seq_length, 1]. - attention = tf.reduce_sum( - tf.multiply(projection, context_vector), axis=2, keep_dims=True) - # Output Shape: [batch, seq_length, 1]. - attention = tf.nn.softmax(attention, axis=1) - # Output Shape: [batch, hidden]. - last_outputs = tf.reduce_sum(tf.multiply(input_tensor, attention), axis=1) - if is_training: - last_outputs = tf.layers.dropout( - last_outputs, bert_config.attention_probs_dropout_prob, training=True) - return last_outputs - - -def get_seq_rep_from_bert(bert_model): - """Get the sequence represenation given a BERT encoder.""" - siamese_input_tensor = bert_model.get_pooled_output() - hidden_size = siamese_input_tensor.shape[-1].value - siamese_input_tensor = tf.layers.dense( - siamese_input_tensor, units=hidden_size, activation=tf.nn.relu) - normalized_siamese_input_tensor = tf.nn.l2_normalize( - siamese_input_tensor, axis=1) - return normalized_siamese_input_tensor - - -def get_sent_reps_masks_normal_loop(sent_index, - input_sent_reps_doc, - input_mask_doc_level, - masked_lm_loss_doc, - masked_lm_example_loss_doc, - masked_lm_weights_doc, - dual_encoder_config, - is_training, - train_mode, - input_ids, - input_mask, - masked_lm_positions, - masked_lm_ids, - masked_lm_weights, - use_one_hot_embeddings, - debugging=False): - """Get the sentence encodings, mask ids and masked word LM loss. - - Args: - sent_index: The index of the current looped sentence. - input_sent_reps_doc: The representations of all sentences in the doc - learned by BERT. - input_mask_doc_level: The document level input masks, which indicates - whether a sentence is a real sentence or a padded sentence. - masked_lm_loss_doc: The sum of all the masked word LM loss. - masked_lm_example_loss_doc: The per example masked word LM loss. - masked_lm_weights_doc: the weights of the maksed LM words. If the position - is corresponding to a real masked word, it is 1.0; It is a padded mask, - the weight is 0. - dual_encoder_config: The config of the dual encoder. - is_training: Whether it is in the training mode. - train_mode: string. The train mode which can be finetune, joint_train, or - pretrain. - input_ids: The ids of the input tokens. - input_mask: The mask of the input tokens. - masked_lm_positions: The positions of the masked words in the language - model training. - masked_lm_ids: The ids of the masked words in LM model training. - masked_lm_weights: The weights of the masked words in LM model training. - use_one_hot_embeddings: Whether use one hot embedding. It should be true - for the runs on TPUs. 
- debugging: bool. Whether it is in the debugging mode. - - Returns: - A list of tensors on the learned sentence representations and the masked - word LM loss. - """ - # Collect token information for the current sentence. - bert_config = modeling.BertConfig.from_json_file( - dual_encoder_config.encoder_config.bert_config_file) - max_sent_length_by_word = dual_encoder_config.encoder_config.max_sent_length_by_word - sent_bert_trainable = dual_encoder_config.encoder_config.sent_bert_trainable - max_predictions_per_seq = dual_encoder_config.encoder_config.max_predictions_per_seq - sent_start = sent_index * max_sent_length_by_word - input_ids_cur_sent = tf.slice(input_ids, [0, sent_start], - [-1, max_sent_length_by_word]) - # Output shape: [batch, max_sent_length_by_word]. - input_mask_cur_sent = tf.slice(input_mask, [0, sent_start], - [-1, max_sent_length_by_word]) - # Output Shape: [batch]. - input_mask_cur_sent_max = tf.reduce_max(input_mask_cur_sent, 1) - # Output Shape: [loop_sent_number_per_doc, batch]. - input_mask_doc_level.append(input_mask_cur_sent_max) - if debugging: - input_ids_cur_sent = tf.Print( - input_ids_cur_sent, [input_ids_cur_sent, input_mask_cur_sent], - message="input_ids_cur_sent in get_sent_reps_masks_lm_loss", - summarize=20) - model = modeling.BertModel( - config=bert_config, - is_training=is_training, - input_ids=input_ids_cur_sent, - input_mask=input_mask_cur_sent, - use_one_hot_embeddings=use_one_hot_embeddings, - sent_bert_trainable=sent_bert_trainable) - with tf.variable_scope("seq_rep_from_bert_sent_dense", reuse=tf.AUTO_REUSE): - normalized_siamese_input_tensor = get_seq_rep_from_bert(model) - input_sent_reps_doc.append(normalized_siamese_input_tensor) - - if (train_mode == constants.TRAIN_MODE_PRETRAIN or - train_mode == constants.TRAIN_MODE_JOINT_TRAIN): - # Collect masked token information for the current sentence. - sent_mask_lm_token_start = sent_index * max_predictions_per_seq - # Output shape: [batch, max_predictions_per_seq]. - masked_lm_positions_cur_sent = tf.slice(masked_lm_positions, - [0, sent_mask_lm_token_start], - [-1, max_predictions_per_seq]) - masked_lm_ids_cur_sent = tf.slice(masked_lm_ids, - [0, sent_mask_lm_token_start], - [-1, max_predictions_per_seq]) - masked_lm_weights_cur_sent = tf.slice(masked_lm_weights, - [0, sent_mask_lm_token_start], - [-1, max_predictions_per_seq]) - # Since in the processed data of smith model, the masked lm positions are - # global indices started from the 1st token of the whole sequence, we need - # to transform this global position to a local position for the current - # sentence. The position index is started from 0. - # Local_index = global_index mod max_sent_length_by_word. - masked_lm_positions_cur_sent = tf.mod(masked_lm_positions_cur_sent, - max_sent_length_by_word) - # Shape of masked_lm_loss_cur_sent [1]. - # Shape of masked_lm_example_loss_cur_sent is [batch, - # max_predictions_per_seq]. - (masked_lm_loss_cur_sent, masked_lm_example_loss_cur_sent, - _) = get_masked_lm_output(bert_config, model.get_sequence_output(), - model.get_embedding_table(), - masked_lm_positions_cur_sent, - masked_lm_ids_cur_sent, - masked_lm_weights_cur_sent) - # Output Shape: [1]. - masked_lm_loss_doc += masked_lm_loss_cur_sent - # Output Shape: [loop_sent_number_per_doc, batch * max_predictions_per_seq]. - masked_lm_example_loss_doc.append(masked_lm_example_loss_cur_sent) - # Output Shape: [loop_sent_number_per_doc, batch, max_predictions_per_seq]. 
- masked_lm_weights_doc.append(masked_lm_weights_cur_sent) - return (input_sent_reps_doc, input_mask_doc_level, masked_lm_loss_doc, - masked_lm_example_loss_doc, masked_lm_weights_doc) - - -def learn_sent_reps_normal_loop(dual_encoder_config, is_training, train_mode, - input_ids_1, input_mask_1, - masked_lm_positions_1, masked_lm_ids_1, - masked_lm_weights_1, input_ids_2, input_mask_2, - masked_lm_positions_2, masked_lm_ids_2, - masked_lm_weights_2, use_one_hot_embeddings): - """Learn the sentence representations with normal loop functions.""" - input_sent_reps_doc_1 = [] - # Generate document level input masks on each sentence based on the word - # level input mask information. - input_mask_doc_level_1 = [] - masked_lm_loss_doc_1 = 0.0 - masked_lm_example_loss_doc_1 = [] - masked_lm_weights_doc_1 = [] - - input_mask_doc_level_2 = [] - input_sent_reps_doc_2 = [] - masked_lm_loss_doc_2 = 0.0 - masked_lm_example_loss_doc_2 = [] - masked_lm_weights_doc_2 = [] - - # Learn the representation for each sentence in the document. - # Setting smaller number of loop_sent_number_per_doc can save memory for the - # model training. - # Shape of masked_lm_loss_doc_1 [1]. - # Shape of masked_lm_example_loss_doc_1 is [max_doc_length_by_sentence, - # batch * max_predictions_per_seq]. - for sent_index in range( - 0, dual_encoder_config.encoder_config.loop_sent_number_per_doc): - (input_sent_reps_doc_1, input_mask_doc_level_1, masked_lm_loss_doc_1, - masked_lm_example_loss_doc_1, - masked_lm_weights_doc_1) = get_sent_reps_masks_normal_loop( - sent_index, input_sent_reps_doc_1, input_mask_doc_level_1, - masked_lm_loss_doc_1, masked_lm_example_loss_doc_1, - masked_lm_weights_doc_1, dual_encoder_config, is_training, train_mode, - input_ids_1, input_mask_1, masked_lm_positions_1, masked_lm_ids_1, - masked_lm_weights_1, use_one_hot_embeddings) - (input_sent_reps_doc_2, input_mask_doc_level_2, masked_lm_loss_doc_2, - masked_lm_example_loss_doc_2, - masked_lm_weights_doc_2) = get_sent_reps_masks_normal_loop( - sent_index, input_sent_reps_doc_2, input_mask_doc_level_2, - masked_lm_loss_doc_2, masked_lm_example_loss_doc_2, - masked_lm_weights_doc_2, dual_encoder_config, is_training, train_mode, - input_ids_2, input_mask_2, masked_lm_positions_2, masked_lm_ids_2, - masked_lm_weights_2, use_one_hot_embeddings) - - # Stack the sentence representations to learn the doc representations. - # Output Shape: [batch, loop_sent_number_per_doc, hidden]. - input_sent_reps_doc_1_unmask = tf.stack(input_sent_reps_doc_1, axis=1) - input_sent_reps_doc_2_unmask = tf.stack(input_sent_reps_doc_2, axis=1) - - # Output Shape: [batch, loop_sent_number_per_doc]. - input_mask_doc_level_1_tensor = tf.stack(input_mask_doc_level_1, axis=1) - input_mask_doc_level_2_tensor = tf.stack(input_mask_doc_level_2, axis=1) - - if (train_mode == constants.TRAIN_MODE_PRETRAIN or - train_mode == constants.TRAIN_MODE_JOINT_TRAIN): - # Output Shape: [batch * max_predictions_per_seq, - # loop_sent_number_per_doc]. - masked_lm_example_loss_doc_1 = tf.stack( - masked_lm_example_loss_doc_1, axis=1) - masked_lm_example_loss_doc_2 = tf.stack( - masked_lm_example_loss_doc_2, axis=1) - - # Output Shape: [batch, loop_sent_number_per_doc, max_predictions_per_seq]. 
- masked_lm_weights_doc_1 = tf.stack(masked_lm_weights_doc_1, axis=1) - masked_lm_weights_doc_2 = tf.stack(masked_lm_weights_doc_2, axis=1) - else: - masked_lm_example_loss_doc_1 = tf.zeros([1]) - masked_lm_example_loss_doc_2 = tf.zeros([1]) - masked_lm_weights_doc_1 = tf.zeros([1]) - masked_lm_weights_doc_2 = tf.zeros([1]) - - return (input_sent_reps_doc_1_unmask, input_mask_doc_level_1_tensor, - input_sent_reps_doc_2_unmask, input_mask_doc_level_2_tensor, - masked_lm_loss_doc_1, masked_lm_loss_doc_2, - masked_lm_example_loss_doc_1, masked_lm_example_loss_doc_2, - masked_lm_weights_doc_1, masked_lm_weights_doc_2) - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling_orig.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling_orig.py deleted file mode 100644 index a2373128f..000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling_orig.py +++ /dev/null @@ -1,491 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Dual encoder SMITH models.""" -from npu_bridge.npu_init import * - -import tensorflow.compat.v1 as tf - -from smith import constants -from smith import layers -from smith import loss_fns -from smith import metric_fns -from smith import utils -from smith.bert import modeling -from smith.bert import optimization - -# Add by:TC -import precision_tool.tf_config as npu_tf_config - -def build_smith_dual_encoder(dual_encoder_config, - train_mode, - is_training, - input_ids_1, - input_mask_1, - masked_lm_positions_1, - masked_lm_ids_1, - masked_lm_weights_1, - input_ids_2, - input_mask_2, - masked_lm_positions_2, - masked_lm_ids_2, - masked_lm_weights_2, - use_one_hot_embeddings, - documents_match_labels, - debugging=False): - """Build the dual encoder SMITH model. - - Args: - dual_encoder_config: the configuration file for the dual encoder model. - train_mode: string. The train mode of the current. It can be finetune, - pretrain or joint_train. - is_training: bool. Whether it in training mode. - input_ids_1: int Tensor with shape [batch, max_seq_length]. The input ids of - input examples of text 1. - input_mask_1: int Tensor with shape [batch, max_seq_length]. The input masks - of input examples of text 1. - masked_lm_positions_1: int Tensor with shape [batch, - max_predictions_per_seq]. 
The input masked LM prediction positions of - input examples of text 1. This can be useful to compute the masked word - prediction LM loss. - masked_lm_ids_1: int Tensor with shape [batch, max_predictions_per_seq]. The - input masked LM prediction ids of input examples of text 1. It is the - ground truth in the masked word LM prediction task. This can be useful to - compute the masked word prediction LM loss. - masked_lm_weights_1: float Tensor with shape [batch, - max_predictions_per_seq]. The input masked LM prediction weights of input - examples of text 1. - input_ids_2: int Tensor with shape [batch, max_seq_length]. The input ids of - input examples of text 2. - input_mask_2: int Tensor with shape [batch, max_seq_length]. The input masks - of input examples of text 2. - masked_lm_positions_2: int Tensor with shape [batch, - max_predictions_per_seq]. The input masked LM prediction positions of - input examples of text 2. This can be useful to compute the masked word - prediction LM loss. - masked_lm_ids_2: int Tensor with shape [batch, max_predictions_per_seq]. The - input masked LM prediction ids of input examples of text 2. It is the - ground truth in the masked word LM prediction task. This can be useful to - compute the masked word prediction LM loss. - masked_lm_weights_2: float Tensor with shape [batch, - max_predictions_per_seq]. The input masked LM prediction weights of input - examples of text 2. - use_one_hot_embeddings: bool. Whether use one hot embeddings. - documents_match_labels: float Tensor with shape [batch]. The ground truth - labels for the input examples. - debugging: bool. Whether it is in the debugging mode. - - Returns: - The masked LM loss, per example LM loss, masked sentence LM loss, per - example masked sentence LM loss, sequence representations, text matching - loss, per example text matching loss, text matching logits, text matching - probabilities and text matching log probabilities. - - Raises: - ValueError: if the doc_rep_combine_mode in dual_encoder_config is invalid. - """ - bert_config = modeling.BertConfig.from_json_file( - dual_encoder_config.encoder_config.bert_config_file) - doc_bert_config = modeling.BertConfig.from_json_file( - dual_encoder_config.encoder_config.doc_bert_config_file) - (input_sent_reps_doc_1_unmask, input_mask_doc_level_1_tensor, - input_sent_reps_doc_2_unmask, input_mask_doc_level_2_tensor, - masked_lm_loss_doc_1, masked_lm_loss_doc_2, masked_lm_example_loss_doc_1, - masked_lm_example_loss_doc_2, masked_lm_weights_doc_1, - masked_lm_weights_doc_2) = layers.learn_sent_reps_normal_loop( - dual_encoder_config, is_training, train_mode, input_ids_1, input_mask_1, - masked_lm_positions_1, masked_lm_ids_1, masked_lm_weights_1, input_ids_2, - input_mask_2, masked_lm_positions_2, masked_lm_ids_2, - masked_lm_weights_2, use_one_hot_embeddings) - if debugging: - input_mask_doc_level_1_tensor = tf.Print( - input_mask_doc_level_1_tensor, - [input_mask_doc_level_1_tensor, input_mask_doc_level_2_tensor], - message="input_mask_doc_level_1_tensor in build_smith_dual_encoder", - summarize=30) - - if dual_encoder_config.encoder_config.use_masked_sentence_lm_loss: - batch_size_static = ( - dual_encoder_config.train_eval_config.train_batch_size if is_training - else dual_encoder_config.train_eval_config.eval_batch_size) - # Generates the sentence masked document represenations. - with tf.variable_scope("mask_sent_in_doc", reuse=tf.AUTO_REUSE): - # Randomly initialize a masked sentence vector and reuse it. 
- # We also need to return the masked sentence position index to get the - # ground truth labels for the masked positions. The shape of - # sent_mask_embedding is [hidden]. - sent_mask_embedding = tf.get_variable( - name="sentence_mask_embedding", - shape=[bert_config.hidden_size], - initializer=tf.truncated_normal_initializer( - stddev=bert_config.initializer_range)) - # Output Shape: [batch, loop_sent_number_per_doc, hidden]. - (input_sent_reps_doc_1_masked, masked_sent_index_1, - masked_sent_weight_1) = layers.get_doc_rep_with_masked_sent( - input_sent_reps_doc=input_sent_reps_doc_1_unmask, - sent_mask_embedding=sent_mask_embedding, - input_mask_doc_level=input_mask_doc_level_1_tensor, - batch_size_static=batch_size_static, - max_masked_sent_per_doc=dual_encoder_config.encoder_config - .max_masked_sent_per_doc, - loop_sent_number_per_doc=dual_encoder_config.encoder_config - .loop_sent_number_per_doc) - (input_sent_reps_doc_2_masked, masked_sent_index_2, - masked_sent_weight_2) = layers.get_doc_rep_with_masked_sent( - input_sent_reps_doc=input_sent_reps_doc_2_unmask, - sent_mask_embedding=sent_mask_embedding, - input_mask_doc_level=input_mask_doc_level_2_tensor, - batch_size_static=batch_size_static, - max_masked_sent_per_doc=dual_encoder_config.encoder_config - .max_masked_sent_per_doc, - loop_sent_number_per_doc=dual_encoder_config.encoder_config - .loop_sent_number_per_doc) - # Learn the document representations based on masked sentence embeddings. - # Note that the variables in the DocBert model are not within the - # "mask_sent_in_doc" variable scope. - model_doc_1 = modeling.DocBertModel( - config=doc_bert_config, - is_training=is_training, - input_reps=input_sent_reps_doc_1_masked, - input_mask=input_mask_doc_level_1_tensor) - model_doc_2 = modeling.DocBertModel( - config=doc_bert_config, - is_training=is_training, - input_reps=input_sent_reps_doc_2_masked, - input_mask=input_mask_doc_level_2_tensor) - # Shape of masked_sent_lm_loss_1 [1]. - # Shape of masked_sent_lm_example_loss_1 is [batch * - # max_predictions_per_seq]. - (masked_sent_lm_loss_1, masked_sent_per_example_loss_1, - _) = layers.get_masked_sent_lm_output(doc_bert_config, - model_doc_1.get_sequence_output(), - input_sent_reps_doc_1_unmask, - masked_sent_index_1, - masked_sent_weight_1) - (masked_sent_lm_loss_2, masked_sent_per_example_loss_2, - _) = layers.get_masked_sent_lm_output(doc_bert_config, - model_doc_2.get_sequence_output(), - input_sent_reps_doc_2_unmask, - masked_sent_index_2, - masked_sent_weight_2) - else: - # Learn the document representations based on unmasked sentence embeddings. - model_doc_1 = modeling.DocBertModel( - config=doc_bert_config, - is_training=is_training, - input_reps=input_sent_reps_doc_1_unmask, - input_mask=input_mask_doc_level_1_tensor) - model_doc_2 = modeling.DocBertModel( - config=doc_bert_config, - is_training=is_training, - input_reps=input_sent_reps_doc_2_unmask, - input_mask=input_mask_doc_level_2_tensor) - masked_sent_lm_loss_1 = 0 - masked_sent_lm_loss_2 = 0 - masked_sent_per_example_loss_1 = tf.zeros(1) - masked_sent_per_example_loss_2 = tf.zeros(1) - masked_sent_weight_1 = tf.zeros(1) - masked_sent_weight_2 = tf.zeros(1) - - with tf.variable_scope("seq_rep_from_bert_doc_dense", reuse=tf.AUTO_REUSE): - normalized_doc_rep_1 = layers.get_seq_rep_from_bert(model_doc_1) - normalized_doc_rep_2 = layers.get_seq_rep_from_bert(model_doc_2) - - # We also dump the contextualized sentence embedding output by document - # level Transformer model. 
These representations maybe useful for sentence - # level tasks. - output_sent_reps_doc_1 = model_doc_1.get_sequence_output() - output_sent_reps_doc_2 = model_doc_2.get_sequence_output() - - # Here we support multiple modes to generate the final document - # representations based on the word/sentence/document level representations - # 1. normal: only use the document level representation as the final document - # representations. - # 2. sum_concat: firstly compute the sum of all sentence level repsentations. - # Then concatenate the sum vector with the document level representations. - # 3. mean_concat: firstly compute the mean of all sentence level - # repsentations. Then concatenate the mean vector with the document level - # representations. - # 4. attention: firstly compute the weighted sum of sentence level - # representations with attention mechanism, then concatenate the weighted sum - # vector with the document level representations. - # The document level mask is to indicate whether each sentence is - # a real sentence (1) or a paded sentence (0). The shape of - # input_mask_doc_level_1_tensor is [batch, max_doc_length_by_sentence]. The - # shape of input_sent_reps_doc_1_unmask is - # [batch, max_doc_length_by_sentence, hidden]. - final_doc_rep_combine_mode = dual_encoder_config.encoder_config.doc_rep_combine_mode - if final_doc_rep_combine_mode == constants.DOC_COMBINE_NORMAL: - final_doc_rep_1 = normalized_doc_rep_1 - final_doc_rep_2 = normalized_doc_rep_2 - elif final_doc_rep_combine_mode == constants.DOC_COMBINE_SUM_CONCAT: - # Output Shape: [batch, 2*hidden]. - final_doc_rep_1 = tf.concat( - [tf.reduce_sum(input_sent_reps_doc_1_unmask, 1), normalized_doc_rep_1], - axis=1) - final_doc_rep_2 = tf.concat( - [tf.reduce_sum(input_sent_reps_doc_2_unmask, 1), normalized_doc_rep_2], - axis=1) - elif final_doc_rep_combine_mode == constants.DOC_COMBINE_MEAN_CONCAT: - final_doc_rep_1 = tf.concat( - [tf.reduce_mean(input_sent_reps_doc_1_unmask, 1), normalized_doc_rep_1], - axis=1) - final_doc_rep_2 = tf.concat( - [tf.reduce_mean(input_sent_reps_doc_2_unmask, 1), normalized_doc_rep_2], - axis=1) - elif final_doc_rep_combine_mode == constants.DOC_COMBINE_ATTENTION: - final_doc_rep_1 = tf.concat([ - layers.get_attention_weighted_sum( - input_sent_reps_doc_1_unmask, bert_config, is_training, - dual_encoder_config.encoder_config.doc_rep_combine_attention_size), - normalized_doc_rep_1 - ], - axis=1) - final_doc_rep_2 = tf.concat([ - layers.get_attention_weighted_sum( - input_sent_reps_doc_2_unmask, bert_config, is_training, - dual_encoder_config.encoder_config.doc_rep_combine_attention_size), - normalized_doc_rep_2 - ], - axis=1) - else: - raise ValueError("Only normal, sum_concat, mean_concat and attention are" - " supported: %s" % final_doc_rep_combine_mode) - (siamese_loss, siamese_example_loss, - siamese_logits) = loss_fns.get_prediction_loss_cosine( - input_tensor_1=final_doc_rep_1, - input_tensor_2=final_doc_rep_2, - labels=documents_match_labels, - similarity_score_amplifier=dual_encoder_config.loss_config - .similarity_score_amplifier, - neg_to_pos_example_ratio=dual_encoder_config.train_eval_config - .neg_to_pos_example_ratio) - - # The shape of masked_lm_loss_doc is [1]. - # The shape of masked_lm_example_loss_doc is [batch * max_predictions_per_seq, - # max_doc_length_by_sentence]. 
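# A standalone NumPy sketch of the non-attention doc_rep_combine_mode options
# described above. The sizes are illustrative (not taken from a real config),
# and the arrays merely stand in for input_sent_reps_doc_*_unmask and
# normalized_doc_rep_*; the attention mode is omitted because it depends on
# learned weights.
import numpy as np

batch, num_sent, hidden = 32, 48, 256
input_sent_reps_doc_unmask = np.random.rand(batch, num_sent, hidden)
normalized_doc_rep = np.random.rand(batch, hidden)

normal = normalized_doc_rep                                   # [batch, hidden]
sum_concat = np.concatenate(
    [input_sent_reps_doc_unmask.sum(axis=1), normalized_doc_rep], axis=1)
mean_concat = np.concatenate(
    [input_sent_reps_doc_unmask.mean(axis=1), normalized_doc_rep], axis=1)
# Both concat modes double the final document representation width.
assert sum_concat.shape == mean_concat.shape == (batch, 2 * hidden)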
- return (masked_lm_loss_doc_1, masked_lm_loss_doc_2, - masked_lm_example_loss_doc_1, masked_lm_example_loss_doc_2, - masked_lm_weights_doc_1, masked_lm_weights_doc_2, - masked_sent_lm_loss_1, masked_sent_lm_loss_2, - masked_sent_per_example_loss_1, masked_sent_per_example_loss_2, - masked_sent_weight_1, masked_sent_weight_2, final_doc_rep_1, - final_doc_rep_2, input_sent_reps_doc_1_unmask, - input_sent_reps_doc_2_unmask, output_sent_reps_doc_1, - output_sent_reps_doc_2, siamese_loss, siamese_example_loss, - siamese_logits) - - -def model_fn_builder(dual_encoder_config, - train_mode, - learning_rate, - num_train_steps, - num_warmup_steps, - use_tpu, - use_one_hot_embeddings, - debugging=False): - """Returns `model_fn` closure for TPUEstimator.""" - - def model_fn(features, labels, mode, params): # pylint: disable=unused-argument - """The `model_fn` for TPUEstimator.""" - tf.logging.info("*** Current mode: %s ***" % mode) - tf.logging.info("*** Features ***") - for name in sorted(features.keys()): - tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) - - input_ids_1 = features["input_ids_1"] - input_mask_1 = features["input_mask_1"] - if train_mode == constants.TRAIN_MODE_FINETUNE: - masked_lm_positions_1 = tf.zeros([1]) - masked_lm_ids_1 = tf.zeros([1]) - masked_lm_weights_1 = tf.zeros([1]) - else: - masked_lm_positions_1 = features["masked_lm_positions_1"] - masked_lm_ids_1 = features["masked_lm_ids_1"] - masked_lm_weights_1 = features["masked_lm_weights_1"] - - input_ids_2 = features["input_ids_2"] - input_mask_2 = features["input_mask_2"] - if train_mode == constants.TRAIN_MODE_FINETUNE: - masked_lm_positions_2 = tf.zeros([1]) - masked_lm_ids_2 = tf.zeros([1]) - masked_lm_weights_2 = tf.zeros([1]) - else: - masked_lm_positions_2 = features["masked_lm_positions_2"] - masked_lm_ids_2 = features["masked_lm_ids_2"] - masked_lm_weights_2 = features["masked_lm_weights_2"] - documents_match_labels = features["documents_match_labels"] - # Since the document_match_labels might contain labels like 0/1/2, we need - # to transfer these labels to binary labels like 0/1. - documents_match_labels = tf.cast(documents_match_labels > 0, tf.float32) - is_real_example = None - if "is_real_example" in features: - is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) - else: - is_real_example = tf.ones( - tf.shape(documents_match_labels), dtype=tf.float32) - - is_training = (mode == tf.estimator.ModeKeys.TRAIN) - - if (dual_encoder_config.encoder_config.model_name == - constants.MODEL_NAME_SMITH_DUAL_ENCODER): - # For the smith model, since the actual looped number of sentences per - # document maybe smaller than max_doc_length_by_sentence, we need to - # overwrite the lm weights with the actual lm weights returned by the - # function. 
- (masked_lm_loss_1, masked_lm_loss_2, masked_lm_example_loss_1, - masked_lm_example_loss_2, masked_lm_weights_1, masked_lm_weights_2, - masked_sent_lm_loss_1, masked_sent_lm_loss_2, - masked_sent_per_example_loss_1, masked_sent_per_example_loss_2, - masked_sent_weight_1, masked_sent_weight_2, seq_embed_1, seq_embed_2, - input_sent_embed_1, input_sent_embed_2, output_sent_embed_1, - output_sent_embed_2, siamese_loss, - siamese_example_loss, siamese_logits) = build_smith_dual_encoder( - dual_encoder_config, train_mode, is_training, input_ids_1, - input_mask_1, masked_lm_positions_1, masked_lm_ids_1, - masked_lm_weights_1, input_ids_2, input_mask_2, - masked_lm_positions_2, masked_lm_ids_2, masked_lm_weights_2, - use_one_hot_embeddings, documents_match_labels, debugging) - else: - raise ValueError( - "Only smith_dual_encoder is supported: %s" % - dual_encoder_config.encoder_config.model_name) - - # There are three different modes for training in the smith model. - # 1. joint_train: a multi-task learning setting which combines the masked - # word LM losses for doc1/doc2 and the siamese matching loss. If we add the - # masked sentence LM task, we also add the masked sentence LM losses for - # the two documents. - # 2. pretrain: only contains the masked word LM losses for doc1/doc2. We - # currently didn't include the NSP loss since NSP loss is not very useful - # according to the XLNet/ RoBERTa/ ALBERT paper. If we add the masked - # sentence LM task, we also add the masked sentence LM losses for the - # two documents. - # 3. finetune: fine tune the model with loaded pretrained checkpoint only - # with the siamese matching loss. If we add the masked sentence LM task, - # we also add the masked sentence LM losses for the two documents. - if train_mode == constants.TRAIN_MODE_JOINT_TRAIN: - total_loss = masked_lm_loss_1 + masked_lm_loss_2 + siamese_loss - elif train_mode == constants.TRAIN_MODE_PRETRAIN: - total_loss = masked_lm_loss_1 + masked_lm_loss_2 - elif train_mode == constants.TRAIN_MODE_FINETUNE: - total_loss = siamese_loss - else: - raise ValueError("Only joint_train, pretrain, finetune are supported.") - # If we add the masked sentence LM task, we also add the masked sentence - # LM losses for the two documents. - if dual_encoder_config.encoder_config.use_masked_sentence_lm_loss: - total_loss += (masked_sent_lm_loss_1 + masked_sent_lm_loss_2) - - total_loss = tf.identity(total_loss, name='total_loss') - - tvars = tf.trainable_variables() - initialized_variable_names = {} - scaffold_fn = None - init_checkpoint = dual_encoder_config.encoder_config.init_checkpoint - # Load pretrained BERT checkpoints if there is a specified path. 
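# A plain-Python sketch of the train-mode loss combination described above
# (joint_train / pretrain / finetune, plus the optional masked sentence LM
# terms). The scalar inputs are placeholder numbers, not real model losses.
def combine_total_loss(train_mode, masked_lm_loss_1, masked_lm_loss_2,
                       siamese_loss, masked_sent_lm_loss_1=0.0,
                       masked_sent_lm_loss_2=0.0,
                       use_masked_sentence_lm_loss=False):
  if train_mode == "joint_train":
    total_loss = masked_lm_loss_1 + masked_lm_loss_2 + siamese_loss
  elif train_mode == "pretrain":
    total_loss = masked_lm_loss_1 + masked_lm_loss_2
  elif train_mode == "finetune":
    total_loss = siamese_loss
  else:
    raise ValueError("Only joint_train, pretrain, finetune are supported.")
  if use_masked_sentence_lm_loss:
    total_loss += masked_sent_lm_loss_1 + masked_sent_lm_loss_2
  return total_loss

# Fine-tuning keeps only the siamese matching loss.
assert combine_total_loss("finetune", 0.8, 0.7, 0.3) == 0.3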
- if init_checkpoint: - tf.logging.info("**** Passed pretrained BERT checkpoint = %s ****", - init_checkpoint) - (assignment_map, initialized_variable_names - ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) - if use_tpu: - - def tpu_scaffold(): - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - return tf.train.Scaffold() - - scaffold_fn = tpu_scaffold - else: - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - - tf.logging.info("**** Trainable Variables ****") - for var in tvars: - init_string = ", *INIT_RANDOMLY*" - if var.name in initialized_variable_names: - init_string = ", *INIT_FROM_CKPT*" - tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, - init_string) - output_spec = None - predicted_score = tf.sigmoid(siamese_logits) - predicted_class = tf.round(predicted_score) - - if dual_encoder_config.encoder_config.model_name == constants.MODEL_NAME_SMITH_DUAL_ENCODER: - _, prediction_dict = utils.get_export_outputs_prediction_dict_smith_de( - seq_embed_1, seq_embed_2, predicted_score, predicted_class, - documents_match_labels, input_sent_embed_1, input_sent_embed_2, - output_sent_embed_1, output_sent_embed_2) - else: - raise ValueError("Unsupported model: %s" % dual_encoder_config.encoder_config.model_name) - - if mode == tf.estimator.ModeKeys.TRAIN: - train_op = optimization.create_optimizer(total_loss, learning_rate, - num_train_steps, - num_warmup_steps, use_tpu) - # Add by:TC 20220705 - output_spec = tf.estimator.EstimatorSpec( - mode=mode, - loss=total_loss, - train_op=train_op, - training_hooks=[npu_tf_config.estimator_dump()]) - - elif mode == tf.estimator.ModeKeys.EVAL: - if (train_mode == constants.TRAIN_MODE_JOINT_TRAIN or - train_mode == constants.TRAIN_MODE_PRETRAIN): - eval_metrics = (metric_fns.metric_fn_pretrain, [ - masked_lm_example_loss_1, masked_lm_weights_1, - masked_sent_per_example_loss_1, masked_sent_weight_1, - masked_lm_example_loss_2, masked_lm_weights_2, - masked_sent_per_example_loss_2, masked_sent_weight_2, - predicted_class, documents_match_labels, is_real_example - ]) - elif train_mode == constants.TRAIN_MODE_FINETUNE: - eval_metrics = (metric_fns.metric_fn_finetune, [ - predicted_class, documents_match_labels, siamese_example_loss, - is_real_example - ]) - else: - raise ValueError("Only joint_train, pretrain, finetune are supported.") - output_spec = tf.estimator.tpu.TPUEstimatorSpec( - mode=mode, - loss=total_loss, - eval_metrics=eval_metrics, - scaffold_fn=scaffold_fn) - - elif mode == tf.estimator.ModeKeys.PREDICT: - output_spec = tf.estimator.tpu.TPUEstimatorSpec( - mode=mode, predictions=prediction_dict, scaffold_fn=scaffold_fn) - else: - raise ValueError("Only TRAIN, EVAL, PREDICT modes are supported: %s" % mode) - - return output_spec - - return model_fn - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_orig.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_orig.py deleted file mode 100644 index 2818aabc8..000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_orig.py +++ /dev/null @@ -1,547 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Library to preprocess text data into SMITH dual encoder model inputs.""" -from npu_bridge.npu_init import * -import collections -import random -import nltk -import tensorflow.compat.v1 as tf -import tqdm -from smith import utils -from smith import wiki_doc_pair_pb2 -from smith.bert import tokenization - -flags = tf.flags - -FLAGS = flags.FLAGS - -flags.DEFINE_string("input_file", None, "Input data path.") - -flags.DEFINE_string( - "output_file", None, - "Output TF examples (or comma-separated list of files) in TFRecord " - "files.") - -flags.DEFINE_string("vocab_file", None, - "The vocabulary file that the SMITH model was trained on.") - -flags.DEFINE_bool( - "do_lower_case", True, - "Whether to lower case the input text. Should be True for uncased " - "models and False for cased models.") - -flags.DEFINE_bool("add_masks_lm", True, - "If true, add masks for word prediction LM pre-training.") - -flags.DEFINE_integer( - "max_sent_length_by_word", 32, "The maximum length of a sentence by tokens." - "A sentence will be cut off if longer than this length, and will be padded " - "if shorter than it. The sentence can also be a sentence block.") - -flags.DEFINE_integer( - "max_doc_length_by_sentence", 64, - "The maximum length of a document by sentences. 
A " - "document will be cut off if longer than this length, and" - "will be padded if shorter than it.") - -flags.DEFINE_bool( - "greedy_sentence_filling", True, - "If true, apply the greedy sentence filling trick to reduce the " - "number of padded tokens.") - -flags.DEFINE_integer("max_predictions_per_seq", 5, - "Maximum number of masked LM predictions per sequence.") - -flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") - -flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") - - -class TrainingInstance(object): - """A single training instance (sentence pair as dual encoder model inputs).""" - - def __init__(self, - tokens_1, - segment_ids_1, - masked_lm_positions_1, - masked_lm_labels_1, - input_mask_1, - masked_lm_weights_1, - tokens_2, - segment_ids_2, - masked_lm_positions_2, - masked_lm_labels_2, - input_mask_2, - masked_lm_weights_2, - instance_id, - documents_match_labels=-1.0): - self.tokens_1 = tokens_1 - self.segment_ids_1 = segment_ids_1 - self.masked_lm_positions_1 = masked_lm_positions_1 - self.masked_lm_labels_1 = masked_lm_labels_1 - self.input_mask_1 = input_mask_1 - self.masked_lm_weights_1 = masked_lm_weights_1 - self.tokens_2 = tokens_2 - self.segment_ids_2 = segment_ids_2 - self.masked_lm_positions_2 = masked_lm_positions_2 - self.masked_lm_labels_2 = masked_lm_labels_2 - self.input_mask_2 = input_mask_2 - self.masked_lm_weights_2 = masked_lm_weights_2 - self.instance_id = instance_id - self.documents_match_labels = documents_match_labels - - def __str__(self): - s = "" - s += "instance_id: %s\n" % self.instance_id - s += "documents_match_labels: %s\n" % (str(self.documents_match_labels)) - s += "tokens_1: %s\n" % (" ".join( - [tokenization.printable_text(x) for x in self.tokens_1])) - s += "segment_ids_1: %s\n" % (" ".join([str(x) for x in self.segment_ids_1 - ])) - s += "masked_lm_positions_1: %s\n" % (" ".join( - [str(x) for x in self.masked_lm_positions_1])) - s += "masked_lm_labels_1: %s\n" % (" ".join( - [tokenization.printable_text(x) for x in self.masked_lm_labels_1])) - s += "input_mask_1: %s\n" % (" ".join([str(x) for x in self.input_mask_1])) - s += "masked_lm_weights_1: %s\n" % (" ".join( - [str(x) for x in self.masked_lm_weights_1])) - s += "tokens_2: %s\n" % (" ".join( - [tokenization.printable_text(x) for x in self.tokens_2])) - s += "segment_ids_2: %s\n" % (" ".join([str(x) for x in self.segment_ids_2 - ])) - s += "masked_lm_positions_2: %s\n" % (" ".join( - [str(x) for x in self.masked_lm_positions_2])) - s += "masked_lm_labels_2: %s\n" % (" ".join( - [tokenization.printable_text(x) for x in self.masked_lm_labels_2])) - s += "input_mask_2: %s\n" % (" ".join([str(x) for x in self.input_mask_2])) - s += "masked_lm_weights_2: %s\n" % (" ".join( - [str(x) for x in self.masked_lm_weights_2])) - s += "\n" - return s - - def __repr__(self): - return self.__str__() - - -def add_features_for_one_doc(features, tokens, segment_ids, input_mask, - masked_lm_positions, masked_lm_labels, - masked_lm_weights, tokenizer, doc_index): - """Add features for one document in a WikiDocPair example.""" - input_ids = tokenizer.convert_tokens_to_ids(tokens) - features["input_ids_" + doc_index] = utils.create_int_feature(input_ids) - features["input_mask_" + doc_index] = utils.create_int_feature(input_mask) - features["segment_ids_" + doc_index] = utils.create_int_feature(segment_ids) - - if masked_lm_labels: - masked_lm_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels) - features["masked_lm_positions_" + - doc_index] = 
utils.create_int_feature(masked_lm_positions) - features["masked_lm_ids_" + - doc_index] = utils.create_int_feature(masked_lm_ids) - features["masked_lm_weights_" + - doc_index] = utils.create_float_feature(masked_lm_weights) - - -def write_instance_to_example_files(instances, tokenizer, output_files): - """Create TF example files from `TrainingInstance`s.""" - writers = [] - for output_file in output_files: - writers.append(tf.python_io.TFRecordWriter(output_file)) - writer_index = 0 - total_written = 0 - for (inst_index, instance) in enumerate(instances): - features = collections.OrderedDict() - add_features_for_one_doc( - features=features, - tokens=instance.tokens_1, - segment_ids=instance.segment_ids_1, - input_mask=instance.input_mask_1, - masked_lm_positions=instance.masked_lm_positions_1, - masked_lm_labels=instance.masked_lm_labels_1, - masked_lm_weights=instance.masked_lm_weights_1, - tokenizer=tokenizer, - doc_index="1") - add_features_for_one_doc( - features=features, - tokens=instance.tokens_2, - segment_ids=instance.segment_ids_2, - input_mask=instance.input_mask_2, - masked_lm_positions=instance.masked_lm_positions_2, - masked_lm_labels=instance.masked_lm_labels_2, - masked_lm_weights=instance.masked_lm_weights_2, - tokenizer=tokenizer, - doc_index="2") - # Adds fields on more content/id information of the current example. - features["instance_id"] = utils.create_bytes_feature( - [bytes(instance.instance_id, "utf-8")]) - features["tokens_1"] = utils.create_bytes_feature( - [bytes(t, "utf-8") for t in instance.tokens_1]) - features["tokens_2"] = utils.create_bytes_feature( - [bytes(t, "utf-8") for t in instance.tokens_2]) - # Adds the documents matching labels. - features["documents_match_labels"] = utils.create_float_feature( - [float(instance.documents_match_labels)]) - tf_example = tf.train.Example(features=tf.train.Features(feature=features)) - - writers[writer_index].write(tf_example.SerializeToString()) - writer_index = (writer_index + 1) % len(writers) - - total_written += 1 - - if inst_index < 5: - tf.logging.info("*** Example ***") - tf.logging.info( - "tokens_1: %s" % - " ".join([tokenization.printable_text(x) for x in instance.tokens_1])) - tf.logging.info( - "tokens_2: %s" % - " ".join([tokenization.printable_text(x) for x in instance.tokens_2])) - - for feature_name in features.keys(): - feature = features[feature_name] - values = [] - if feature.int64_list.value: - values = feature.int64_list.value - elif feature.float_list.value: - values = feature.float_list.value - elif feature.bytes_list.value: - values = feature.bytes_list.value - tf.logging.info("%s: %s" % - (feature_name, " ".join([str(x) for x in values]))) - - for writer in writers: - writer.close() - - tf.logging.info("Wrote %d total instances", total_written) - - -def get_smith_model_tokens(input_text, tokenizer, sent_token_counter): - """Generate tokens given an input text for the SMITH model.""" - res_tokens = [] - for sent in nltk.tokenize.sent_tokenize(input_text): - # The returned res_tokens is a 2D list to maintain the sentence boundary - # information. We removed all the empty tokens in this step. - if not sent: - continue - tokens = [w for w in tokenizer.tokenize(sent) if w] - sent_token_counter[0] += 1 # Track number of sentences. - sent_token_counter[1] += len(tokens) # Track number of tokens. 
- res_tokens.append(tokens) - return (res_tokens, sent_token_counter) - - -def create_training_instances_wiki_doc_pair( - input_file, tokenizer, max_sent_length_by_word, max_doc_length_by_sentence, - masked_lm_prob, max_predictions_per_seq, rng): - """Create `TrainingInstance`s from WikiDocPair proto data.""" - # The input data is in the WikiDocPair proto format in tfrecord. - # Add by:TC - wiki_doc_pair = wiki_doc_pair_pb2.WikiDocPair() - instances = [] - # Add some counters to track some data statistics. - sent_token_counter = [0, 0] - for example in tqdm.tqdm(tf.python_io.tf_record_iterator(input_file)): - doc_pair = wiki_doc_pair.FromString(example) - # If model_name = smith_dual_encoder, we firstly use a sentence tokenizer - # to split doc_one/doc_two texts into different sentences and use [SEN] to - # label the sentence boundary information. So in the masking and padding - # step, we know the boundary between different sentences and we can do the - # masking and padding according to the actual length of each sentence. - doc_one_text = " \n\n\n\n\n\n ".join( - [a.text for a in doc_pair.doc_one.section_contents]) - doc_two_text = " \n\n\n\n\n\n ".join( - [a.text for a in doc_pair.doc_two.section_contents]) - doc_one_text = tokenization.convert_to_unicode(doc_one_text).strip() - doc_two_text = tokenization.convert_to_unicode(doc_two_text).strip() - doc_one_tokens, sent_token_counter = get_smith_model_tokens( - doc_one_text, tokenizer, sent_token_counter) - doc_two_tokens, sent_token_counter = get_smith_model_tokens( - doc_two_text, tokenizer, sent_token_counter) - # Skip the document pairs if any document is empty. - if not doc_one_tokens or not doc_two_tokens: - continue - vocab_words = list(tokenizer.vocab.keys()) - instance_id = doc_pair.id - if doc_pair.human_label_for_classification: - doc_match_label = doc_pair.human_label_for_classification - else: - # Set the label as 0.0 if there are no available labels. 
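# A small sketch of the sentence-splitting idea in get_smith_model_tokens:
# nltk.tokenize.sent_tokenize keeps sentence boundaries as a 2D token list.
# The whitespace split below is only a stand-in for the BERT FullTokenizer
# (which needs a vocab file), and nltk's "punkt" data must be installed
# locally for sent_tokenize to work.
import nltk

def split_into_sentence_token_lists(text):
  res_tokens = []
  for sent in nltk.tokenize.sent_tokenize(text):
    tokens = [w for w in sent.split() if w]  # stand-in word tokenizer
    if tokens:
      res_tokens.append(tokens)
  return res_tokens

# e.g. returns [['First', 'sentence.'], ['Second', 'one.']]
print(split_into_sentence_token_lists("First sentence. Second one."))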
- doc_match_label = 0.0 - instances.append( - create_instance_from_wiki_doc_pair( - instance_id, doc_match_label, doc_one_tokens, doc_two_tokens, - max_sent_length_by_word, max_doc_length_by_sentence, masked_lm_prob, - max_predictions_per_seq, vocab_words, rng)) - rng.shuffle(instances) - return (instances, sent_token_counter) - - -def create_instance_from_wiki_doc_pair(instance_id, doc_match_label, - doc_one_tokens, doc_two_tokens, - max_sent_length_by_word, - max_doc_length_by_sentence, - masked_lm_prob, max_predictions_per_seq, - vocab_words, rng): - """Creates `TrainingInstance`s for a WikiDocPair input data.""" - (tokens_1, segment_ids_1, masked_lm_positions_1, masked_lm_labels_1, \ - input_mask_1, masked_lm_weights_1) = \ - get_tokens_segment_ids_masks(max_sent_length_by_word, max_doc_length_by_sentence, doc_one_tokens, masked_lm_prob, - max_predictions_per_seq, vocab_words, rng) - (tokens_2, segment_ids_2, masked_lm_positions_2, masked_lm_labels_2, \ - input_mask_2, masked_lm_weights_2) = \ - get_tokens_segment_ids_masks(max_sent_length_by_word, max_doc_length_by_sentence, doc_two_tokens, masked_lm_prob, - max_predictions_per_seq, vocab_words, rng) - instance = TrainingInstance( - tokens_1=tokens_1, - segment_ids_1=segment_ids_1, - masked_lm_positions_1=masked_lm_positions_1, - masked_lm_labels_1=masked_lm_labels_1, - input_mask_1=input_mask_1, - masked_lm_weights_1=masked_lm_weights_1, - tokens_2=tokens_2, - segment_ids_2=segment_ids_2, - masked_lm_positions_2=masked_lm_positions_2, - masked_lm_labels_2=masked_lm_labels_2, - input_mask_2=input_mask_2, - masked_lm_weights_2=masked_lm_weights_2, - instance_id=instance_id, - documents_match_labels=doc_match_label) - return instance - - -def get_tokens_segment_ids_masks(max_sent_length_by_word, - max_doc_length_by_sentence, doc_one_tokens, - masked_lm_prob, max_predictions_per_seq, - vocab_words, rng): - """Get the tokens, segment ids and masks of an input sequence.""" - # The format of tokens for SMITH dual encoder models is like: - # [CLS] block1_token1 block1_token2 block1_token3 ... [SEP] [SEP] [PAD] ... - # [CLS] block2_token1 block2_token2 block2_token3 ... [SEP] [SEP] [PAD] ... - # [CLS] block3_token1 block3_token2 block3_token3 ... [SEP] [SEP] [PAD] ... - # If max_sent_length_by_word is large, then there will be many padded - # words in the sentence. Here we added an optional "greedy sentence filling" - # trick in order to reduce the number of padded words and maintain all - # content in the document. We allow a "sentence" block to contain more than - # one natural sentence and try to fill as many as sentences into the - # "sentence" block. If a sentence will be cut off and the current sentence - # block is not empty, we will put the sentence into the next "sentence" block. - # According to ALBERT paper and RoBERTa paper, a segment is usually comprised - # of more than one natural sentence, which has been shown to benefit - # performance. doc_one_tokens is a 2D list which contains the sentence - # boundary information. - sentence_num = len(doc_one_tokens) - # sent_block_token_list is a 2D list to maintain sentence block tokens. - sent_block_token_list = [] - natural_sentence_index = -1 - while natural_sentence_index + 1 < sentence_num: - natural_sentence_index += 1 - sent_tokens = doc_one_tokens[natural_sentence_index] - if not sent_tokens: - continue - if FLAGS.greedy_sentence_filling: - cur_sent_block_length = 0 - cur_sent_block = [] - # Fill as many senteces as possible in the current sentence block in a - # greedy way. 
- while natural_sentence_index < sentence_num: - cur_natural_sent_tokens = doc_one_tokens[natural_sentence_index] - if not cur_natural_sent_tokens: - natural_sentence_index += 1 - continue - cur_sent_len = len(cur_natural_sent_tokens) - if ((cur_sent_block_length + cur_sent_len) <= - (max_sent_length_by_word - 3)) or cur_sent_block_length == 0: - # One exceptional case here is that if the 1st sentence of a sentence - # block is already going across the boundary, then the current - # sentence block will be empty. So when cur_sent_block_length is 0 - # and we meet a natural sentence with length longer than - # (max_sent_length_by_word - 3), we still put this natural sentence - # in the current sentence block. In this case, this long natural - # sentence will be cut off with the final length up to - # (max_sent_length_by_word - 3). - cur_sent_block.extend(cur_natural_sent_tokens) - cur_sent_block_length += cur_sent_len - natural_sentence_index += 1 - else: - # If cur_sent_block_length + cur_sent_len > max_sent_length_by_word-3 - # and the current sentence block is not empty, the sentence which - # goes across the boundary will be put into the next sentence block. - natural_sentence_index -= 1 - break - sent_tokens = cur_sent_block - sent_block_token_list.append(sent_tokens) - if len(sent_block_token_list) >= max_doc_length_by_sentence: - break # Skip more sentence blocks if the document is too long. - # For each sentence block, generate the token sequences, masks and paddings. - tokens_doc = [] - segment_ids_doc = [] - masked_lm_positions_doc = [] - masked_lm_labels_doc = [] - input_mask_doc = [] - masked_lm_weights_doc = [] - for block_index in range(len(sent_block_token_list)): - tokens_block, segment_ids_block, masked_lm_positions_block, \ - masked_lm_labels_block, input_mask_block, masked_lm_weights_block = \ - get_token_masks_paddings( - sent_block_token_list[block_index], - max_sent_length_by_word, - masked_lm_prob, - max_predictions_per_seq, - vocab_words, - rng, - block_index) - tokens_doc.extend(tokens_block) - segment_ids_doc.extend(segment_ids_block) - masked_lm_positions_doc.extend(masked_lm_positions_block) - masked_lm_labels_doc.extend(masked_lm_labels_block) - input_mask_doc.extend(input_mask_block) - masked_lm_weights_doc.extend(masked_lm_weights_block) - - # Pad sentence blocks if the actual number of sentence blocks is less than - # max_doc_length_by_sentence. - sentence_block_index = len(sent_block_token_list) - while sentence_block_index < max_doc_length_by_sentence: - for _ in range(max_sent_length_by_word): - tokens_doc.append("[PAD]") - segment_ids_doc.append(0) - input_mask_doc.append(0) - for _ in range(max_predictions_per_seq): - masked_lm_positions_doc.append(0) - masked_lm_labels_doc.append("[PAD]") - masked_lm_weights_doc.append(0.0) - sentence_block_index += 1 - assert len(tokens_doc) == max_sent_length_by_word * max_doc_length_by_sentence - assert len(masked_lm_labels_doc - ) == max_predictions_per_seq * max_doc_length_by_sentence - return (tokens_doc, segment_ids_doc, masked_lm_positions_doc, - masked_lm_labels_doc, input_mask_doc, masked_lm_weights_doc) - - -def get_token_masks_paddings(block_tokens, max_sent_length_by_word, - masked_lm_prob, max_predictions_per_seq, - vocab_words, rng, block_index): - """Generates tokens, masks and paddings for the input block tokens.""" - # Account for [CLS], [SEP], [SEP] - max_num_tokens = max_sent_length_by_word - 3 - # Truncates the sequence if sequence length is longer than max_num_tokens. 
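# A pure-Python sketch of the greedy sentence filling described above: natural
# sentences are packed into fixed-size "sentence blocks", a sentence that would
# overflow a non-empty block is deferred to the next block, and an over-long
# sentence that starts an empty block is still placed there (it is truncated
# later). This illustrates the idea only; it is not the exact preprocessing code.
def greedy_fill(sentences, max_sent_length_by_word, max_doc_length_by_sentence):
  blocks = []
  i = 0
  while i < len(sentences) and len(blocks) < max_doc_length_by_sentence:
    block, block_len = [], 0
    while i < len(sentences):
      sent = sentences[i]
      # Reserve 3 positions per block for [CLS], [SEP], [SEP].
      if block_len == 0 or block_len + len(sent) <= max_sent_length_by_word - 3:
        block.extend(sent)
        block_len += len(sent)
        i += 1
      else:
        break
    blocks.append(block)
  return blocks

# Two blocks: the third sentence would overflow the first block (budget 20 - 3 = 17).
print(greedy_fill([["a"] * 8, ["b"] * 7, ["c"] * 5], 20, 3))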
- tokens = [] - segment_ids = [] - if len(block_tokens) > max_num_tokens: - block_tokens = block_tokens[0:max_num_tokens] - tokens_a = block_tokens - tokens.append("[CLS]") - segment_ids.append(0) - for token in tokens_a: - tokens.append(token) - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) - masked_lm_positions = [] - masked_lm_labels = [] - masked_lm_weights = [] - if max_predictions_per_seq > 0: - (tokens, masked_lm_positions, - masked_lm_labels) = utils.create_masked_lm_predictions( - tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) - # Add [PAD] to tokens and masked LM related lists. - input_mask = [1] * len(tokens) - while len(tokens) < max_sent_length_by_word: - tokens.append("[PAD]") - input_mask.append(0) - segment_ids.append(0) - - assert len(tokens) == max_sent_length_by_word - assert len(input_mask) == max_sent_length_by_word - assert len(segment_ids) == max_sent_length_by_word - - if max_predictions_per_seq > 0: - # Transfer local positions in masked_lm_positions to global positions in the - # whole document to be consistent with the model training pipeline. - masked_lm_positions = [ - (i + max_sent_length_by_word * block_index) for i in masked_lm_positions - ] - masked_lm_weights = [1.0] * len(masked_lm_labels) - - while len(masked_lm_positions) < max_predictions_per_seq: - masked_lm_positions.append(0) - masked_lm_labels.append("[PAD]") - masked_lm_weights.append(0.0) - return (tokens, segment_ids, masked_lm_positions, masked_lm_labels, - input_mask, masked_lm_weights) - - -def main(_): - tf.logging.set_verbosity(tf.logging.INFO) - - tokenizer = tokenization.FullTokenizer( - vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) - - input_files = [] - for input_pattern in FLAGS.input_file.split(","): - input_files.extend(tf.gfile.Glob(input_pattern)) - - tf.logging.info("*** Reading from input files ***") - for input_file in input_files: - tf.logging.info(" %s", input_file) - rng = random.Random(FLAGS.random_seed) - # Creates training instances. - max_predictions_per_seq = FLAGS.max_predictions_per_seq if FLAGS.add_masks_lm else 0 - masked_lm_prob = FLAGS.masked_lm_prob if FLAGS.add_masks_lm else 0 - instances, sent_token_counter = create_training_instances_wiki_doc_pair( - input_file=FLAGS.input_file, - tokenizer=tokenizer, - max_sent_length_by_word=FLAGS.max_sent_length_by_word, - max_doc_length_by_sentence=FLAGS.max_doc_length_by_sentence, - masked_lm_prob=masked_lm_prob, - max_predictions_per_seq=max_predictions_per_seq, - rng=rng) - - output_files = FLAGS.output_file.split(",") - tf.logging.info("*** Writing to output files ***") - for output_file in output_files: - tf.logging.info(" %s", output_file) - - # Transfers training instances into tensorflow examples and write the results. - write_instance_to_example_files(instances, tokenizer, output_files) - - # Finally outputs some data statistics. 
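# A worked example of the per-block token layout and the local-to-global
# masked LM position mapping used above, with a toy max_sent_length_by_word
# of 8; the tokens are made up for illustration.
max_sent_length_by_word = 8
block_tokens = ["my", "dinner"]
tokens = ["[CLS]"] + block_tokens + ["[SEP]", "[SEP]"]
input_mask = [1] * len(tokens)
while len(tokens) < max_sent_length_by_word:
  tokens.append("[PAD]")
  input_mask.append(0)
# tokens == ['[CLS]', 'my', 'dinner', '[SEP]', '[SEP]', '[PAD]', '[PAD]', '[PAD]']
# input_mask == [1, 1, 1, 1, 1, 0, 0, 0]

# If word position 2 ("dinner") were masked in block_index 3, its global
# position in the flattened document sequence would be:
block_index = 3
global_position = 2 + max_sent_length_by_word * block_index  # == 26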
- tf.logging.info("sent_count, token_count, doc_pair_count: %d %d %d", - sent_token_counter[0], sent_token_counter[1], len(instances)) - - -if __name__ == "__main__": - flags.mark_flag_as_required("input_file") - flags.mark_flag_as_required("output_file") - flags.mark_flag_as_required("vocab_file") - tf.app.run() - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test_orig.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test_orig.py deleted file mode 100644 index f4b828c75..000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test_orig.py +++ /dev/null @@ -1,108 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from npu_bridge.npu_init import * -import random -import tempfile - -from absl import flags -import tensorflow.compat.v1 as tf - -from smith import preprocessing_smith -from smith.bert import tokenization - -FLAGS = flags.FLAGS - - -class PreprocessingSmithTest(tf.test.TestCase): - - def setUp(self): - super(PreprocessingSmithTest, self).setUp() - doc_one_text = ( - "I am in Dominick's for my dinner. OK, no problem. 
I am " - "in Dominick's for my dinner which is the best dinner I have " - "in my whole life.") - doc_one_text = tokenization.convert_to_unicode(doc_one_text).strip() - vocab_tokens = [ - "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "i", "am", "in", "for", - "my", "dinner", "ok", "no", "problem", "which", "is", "the", "be", - "##s", "##t", "," - ] - with tempfile.NamedTemporaryFile(delete=False) as vocab_writer: - vocab_writer.write("".join([x + "\n" for x in vocab_tokens - ]).encode("utf-8")) - self.vocab_file = vocab_writer.name - self.tokenizer = tokenization.FullTokenizer( - vocab_file=self.vocab_file, do_lower_case=True) - self.vocab_words = list(self.tokenizer.vocab.keys()) - self.rng = random.Random(12345) - self.doc_one_tokens, _ = preprocessing_smith.get_smith_model_tokens( - doc_one_text, self.tokenizer, [0, 0]) - self.max_sent_length_by_word = 20 - self.max_doc_length_by_sentence = 3 - self.greedy_sentence_filling = True - self.max_predictions_per_seq = 0 - self.masked_lm_prob = 0 - - def test_get_tokens_segment_ids_masks(self): - (tokens_1, segment_ids_1, _, _, input_mask_1, _) = \ - preprocessing_smith.get_tokens_segment_ids_masks( - max_sent_length_by_word=self.max_sent_length_by_word, - max_doc_length_by_sentence=self.max_doc_length_by_sentence, - doc_one_tokens=self.doc_one_tokens, - masked_lm_prob=self.masked_lm_prob, - max_predictions_per_seq=self.max_predictions_per_seq, - vocab_words=self.vocab_words, - rng=self.rng) - self.assertEqual(tokens_1, [ - "[CLS]", "i", "am", "in", "[UNK]", "[UNK]", "[UNK]", "for", "my", - "dinner", "[UNK]", "ok", ",", "no", "problem", "[UNK]", "[SEP]", - "[SEP]", "[PAD]", "[PAD]", "[CLS]", "i", "am", "in", "[UNK]", "[UNK]", - "[UNK]", "for", "my", "dinner", "which", "is", "the", "be", "##s", - "##t", "dinner", "i", "[SEP]", "[SEP]", "[PAD]", "[PAD]", "[PAD]", - "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", - "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", - "[PAD]" - ]) - self.assertEqual(segment_ids_1, [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - ]) - self.assertEqual(input_mask_1, [ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - ]) - - -if __name__ == "__main__": - tf.test.main() - -- Gitee From 9c09b647738c4186473e3c703426223bbbafe145 Mon Sep 17 00:00:00 2001 From: QiuYao Date: Tue, 11 Oct 2022 19:51:23 +0800 Subject: [PATCH 3/3] update file --- .../nlp/smith_ID2025_for_ACL/README.md | 3 +- .../contrib/nlp/smith_ID2025_for_ACL/atc.sh | 3 +- .../nlp/smith_ID2025_for_ACL/ckpt2pb.py | 48 ++++++++++++++----- .../nlp/smith_ID2025_for_ACL/ckpt2pb.sh | 2 +- .../gen_bin_by_img2bin.sh | 3 +- .../contrib/nlp/smith_ID2025_for_ACL/msame.sh | 3 +- 6 files changed, 45 insertions(+), 17 deletions(-) diff --git a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/README.md b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/README.md index f638d40a5..ccba07f62 100644 --- a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/README.md +++ b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/README.md @@ -109,4 +109,5 @@ python3 img2bin.py -i ./input_mask_2.txt -t int32 -o ./out/ {"predicted_score": "0.5", "predicted_class": "0.0"} {"predicted_score": "0.5", "predicted_class": "0.0"} {"predicted_score": "0.9975251", "predicted_class": "1.0"} 
-{"predicted_score": "0.99752605", "predicted_class": "1.0"} \ No newline at end of file +{"predicted_score": "0.99752605", "predicted_class": "1.0"} + diff --git a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/atc.sh b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/atc.sh index b7c416779..6d15ed595 100644 --- a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/atc.sh +++ b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/atc.sh @@ -1 +1,2 @@ -atc --model=smith.pb --framework=3 --output=pb_res --soc_version=Ascend910 --input_shape="input_ids_1:32,2048;input_mask_1:32,2048;input_ids_2:32,2048;input_mask_2:32,2048" --out_nodes="seq_rep_from_bert_doc_dense/l2_normalize_1:0;Sigmoid:0;Round:0" --log=debug \ No newline at end of file +atc --model=smith.pb --framework=3 --output=pb_res --soc_version=Ascend910 --input_shape="input_ids_1:32,2048;input_mask_1:32,2048;input_ids_2:32,2048;input_mask_2:32,2048" --out_nodes="seq_rep_from_bert_doc_dense/l2_normalize_1:0;Sigmoid:0;Round:0" --log=debug + diff --git a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.py b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.py index 8d8140ab6..a65ffcb51 100644 --- a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.py +++ b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.py @@ -1,9 +1,33 @@ -# -*- coding: utf-8 -*- -""" - Created on 2022/4/21 0:18 - - @Author T.c -""" +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + from absl import flags from absl import app import tensorflow.compat.v1 as tf @@ -16,17 +40,19 @@ from tensorflow.python.framework import graph_util FLAGS = flags.FLAGS flags.DEFINE_string("dual_encoder_config_file", None, "The proto config file for dual encoder SMITH models.") +flags.DEFINE_string("ckpt_path", None, "The NPU ckpt file.") +flags.DEFINE_string("output_graph", "smith.pb", "The output path of pb file.") -# 指定checkpoint路径 -ckpt_path = "/home/test_user06/tc_workspace/data/result_file/tc_wsp_20220920_V4/model.ckpt-10000" +ckpt_path = FLAGS.ckpt_path +output_graph = FLAGS.output_graph def main(_argv): input_ids_1 = tf.placeholder(tf.int32, shape=(32, 2048), name="input_ids_1") - input_mask_1 = tf.placeholder(tf.int32, shape=(32, 2048), name="input_mask_1") #features["input_mask_1"] + input_mask_1 = tf.placeholder(tf.int32, shape=(32, 2048), name="input_mask_1") input_ids_2 = tf.placeholder(tf.int32, shape=(32, 2048), name="input_ids_2") - input_mask_2 = tf.placeholder(tf.int32, shape=(32, 2048), name="input_mask_2") #features["input_mask_2"] + input_mask_2 = tf.placeholder(tf.int32, shape=(32, 2048), name="input_mask_2") exp_config = utils.load_config_from_file(FLAGS.dual_encoder_config_file, experiment_config_pb2.DualEncoderConfig()) tf.logging.info("*** Features ***") masked_lm_positions_1 = tf.zeros([1]) @@ -55,8 +81,6 @@ def main(_argv): graph = tf.get_default_graph() input_graph_def = graph.as_graph_def() - output_graph = "/home/test_user06/tc_workspace/smith_0927_del_full_dropout_27_NPU.pb" - with tf.Session() as sess: sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() diff --git a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.sh b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.sh index f4b954047..077429966 100644 --- a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.sh +++ b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.sh @@ -1 +1 @@ -python3 ckpt2pb.py --dual_encoder_config_file=smith/config/dual_encoder_config.smith_wsp.32.48.pbtxt \ No newline at end of file +python3 ckpt2pb.py --dual_encoder_config_file=smith/config/dual_encoder_config.smith_wsp.32.48.pbtxt --ckpt_path=./model.ckpt-10000 --output_graph=./smith.pb diff --git a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/gen_bin_by_img2bin.sh b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/gen_bin_by_img2bin.sh index a148192ea..7a74ffa30 100644 --- a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/gen_bin_by_img2bin.sh +++ b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/gen_bin_by_img2bin.sh @@ -1,4 +1,5 @@ python3 img2bin.py -i ./input_ids_1.txt -t int32 -o ./out/ python3 img2bin.py -i ./input_ids_2.txt -t int32 -o ./out/ python3 img2bin.py -i ./input_mask_1.txt -t int32 -o ./out/ -python3 img2bin.py -i ./input_mask_2.txt -t int32 -o ./out/ \ No newline at end of file +python3 img2bin.py -i ./input_mask_2.txt -t int32 -o ./out/ + diff --git a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/msame.sh b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/msame.sh index f9f5f37d5..b6b983348 100644 --- a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/msame.sh +++ b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/msame.sh @@ -1 +1,2 @@ -./msame --model "pb_res.om" --input "out/out4tmp/input_ids_1.bin,out/out4tmp/input_ids_2.bin,out/out4tmp/input_mask_1.bin,out/out4tmp/input_mask_2.bin" --output "output" --loop 1 --outfmt TXT --debug true \ No newline at end of file +./msame --model 
"pb_res.om" --input "out/out4tmp/input_ids_1.bin,out/out4tmp/input_ids_2.bin,out/out4tmp/input_mask_1.bin,out/out4tmp/input_mask_2.bin" --output "output" --loop 1 --outfmt TXT --debug true + -- Gitee