From 1f09a306275e8d71263e3fc0dcc933f9266ebb40 Mon Sep 17 00:00:00 2001 From: QiuYao Date: Tue, 27 Sep 2022 20:08:25 +0800 Subject: [PATCH 1/3] init add atc code for smith --- .../layers_orig.py | 570 ++++++++++++++++++ .../modeling_orig.py | 491 +++++++++++++++ .../preprocessing_smith_orig.py | 547 +++++++++++++++++ .../preprocessing_smith_test.py | 21 +- .../preprocessing_smith_test_orig.py | 108 ++++ 5 files changed, 1730 insertions(+), 7 deletions(-) create mode 100644 TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/layers_orig.py create mode 100644 TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling_orig.py create mode 100644 TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_orig.py create mode 100644 TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test_orig.py diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/layers_orig.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/layers_orig.py new file mode 100644 index 000000000..7f1254935 --- /dev/null +++ b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/layers_orig.py @@ -0,0 +1,570 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Model layers in dual encoder SMITH model.""" +from npu_bridge.npu_init import * +from six.moves import range +from npu_bridge.estimator.npu import npu_convert_dropout +import tensorflow.compat.v1 as tf + +from smith import constants +from smith.bert import modeling + + +def get_doc_rep_with_masked_sent(input_sent_reps_doc, + sent_mask_embedding, + input_mask_doc_level, + batch_size_static=32, + max_masked_sent_per_doc=2, + loop_sent_number_per_doc=32): + """Get the document representations with masked sentences. + + Args: + input_sent_reps_doc: float Tensor. The independent sentence embeddings + without masks for the sentences in the current document. The shape is + [batch, loop_sent_number_per_doc, hidden]. + sent_mask_embedding: float Tensor. The sentence embedding vector for the + masked position. The shape is [hidden]. + input_mask_doc_level: int Tensor. The input masks on the document level to + identify whether a location is a real sentence (mask = 1) or a padded + sentence (mask = 0). The shape is [batch, loop_sent_number_per_doc]. + batch_size_static: scalar. 
The static batch size depending on the training + or the evaluation mode. + max_masked_sent_per_doc: scalar. The maximum number of masked sentences + per document. + loop_sent_number_per_doc: scalar. The number of looped sentences per + document. + + Returns: + The document representations with masked sentences and the positions/ + weights for each masked sentences. This masked sentence weight is 1 for the + sampled real sentence position and 0 for the padded sentence position. + """ + # We at least mask two sentences to build a candidate sentence pool for + # negative sentence sampling. We generate the masked_sent_index and + # masked_sent_weight for each document. Note that we do not add any word + # or sentence level masks during prediction or inference stage. + max_masked_sent_per_doc = max(max_masked_sent_per_doc, 2) + input_sent_reps_doc_list = tf.unstack( + input_sent_reps_doc, num=batch_size_static) + real_sent_number_per_doc = tf.unstack( + tf.reduce_sum(input_mask_doc_level, 1), num=batch_size_static) + masked_sent_index_list = [] + masked_sent_weight_list = [] + + # For each example in the current batch, we randomly sample + # max_masked_sent_per_doc positions to mask the sentences. For each masked + # sentence position, the sentence in the current position is the positive + # example. The other co-masked sentences are the negative examples. + # The sampled sentence indexes will not be duplicated. + for batch_i in range(0, batch_size_static): + # Since everything in TPU must have a fixed shape, here the max sampled + # sentence index can be as large as loop_sent_number_per_doc. We will + # generate the corresponding sentence LM weights to reduce the impact + # on the final masked sentence LM loss following a similar way with the + # handling of masked word LM loss and masked word LM weights. + real_sent_number = real_sent_number_per_doc[batch_i] + sampled_sent_index = tf.slice( + tf.random_shuffle(tf.range(loop_sent_number_per_doc)), [0], + [max_masked_sent_per_doc]) + sampled_sent_index = tf.sort(sampled_sent_index) + masked_sent_index_list.append(sampled_sent_index) + # Generates the corresponding sampled_sent_weight + sample_sent_weight = tf.cast( + tf.less(sampled_sent_index, real_sent_number), tf.float32) + masked_sent_weight_list.append(sample_sent_weight) + + indices = tf.reshape(sampled_sent_index, [max_masked_sent_per_doc, -1]) + # Duplicates sent_mask_embedding for each masked position. + updates = tf.reshape( + tf.tile( + sent_mask_embedding, + [max_masked_sent_per_doc], + ), [max_masked_sent_per_doc, -1]) + input_sent_reps_doc_list[batch_i] = tf.tensor_scatter_update( + input_sent_reps_doc_list[batch_i], indices, updates) + # Here masked_sent_index_list is a list a tensors, where each tensor stores + # the masked sentence positions for each document in the current batch. The + # shape of masked_sent_index_list is [batch, max_masked_sent_per_doc]. + # Here masked_sent_weight_list is a list a tensors, where each tensor stores + # the masked sentence weights for each document in the current batch. The + # shape of masked_sent_weight_list is [batch, max_masked_sent_per_doc]. + return (tf.stack(input_sent_reps_doc_list), tf.stack(masked_sent_index_list), + tf.stack(masked_sent_weight_list)) + + +def get_masked_sent_lm_output(bert_config, + input_tensor, + cur_sent_reps_doc_unmask, + sent_masked_positions, + sent_masked_weights, + debugging=False): + """Get the sentence level masked LM loss. + + Args: + bert_config: BertConfig object. 
The configuration file for the document + level BERT model. + input_tensor: float Tensor. The contextualized representations of all + sentences learned by the document level BERT model. The shape is [batch, + loop_sent_number_per_doc, hidden]. This is the model prediction. + cur_sent_reps_doc_unmask: float Tensor. The unmasked sentence + representations of the current document. The shape is [batch, + loop_sent_number_per_doc, hidden]. This is the source of the ground + truth and negative examples in the masked sentence prediction. + sent_masked_positions: int Tensor. The masked sentence positions in the + current document. The shape is [batch, max_masked_sent_per_doc]. + sent_masked_weights: float Tensor. The masked sentence weights in the + current document. The shape is [batch, max_masked_sent_per_doc]. + debugging: bool. Whether it is in the debugging mode. + + Returns: + The masked sentence LM loss and the mask sentence LM loss per example. + + """ + # The current method for masked sentence prediction: we approach this problem + # as a multi-class classification problem similar to the masked word LM task. + # For each masked sentence position, the sentence in the current position is + # the positive example. The other co-masked sentences in the current document + # and in the other documents of the same batch are the negative examples. We + # compute the cross entropy loss over the sentence prediction task following + # the implementation of the masked word LM loss in the BERT model. + + input_tensor_shape = modeling.get_shape_list(input_tensor) + batch_size = input_tensor_shape[0] + masked_position_shape = modeling.get_shape_list(sent_masked_positions) + max_predictions_per_seq = masked_position_shape[1] + + # In the context of masked sentence prediction, the max_predictions_per_seq + # is the same with max_masked_sent_per_doc. + # Output Shape: [batch * max_predictions_per_seq, hidden]. + # Input_tensor is the model prediction for each position. + input_tensor = gather_indexes(input_tensor, sent_masked_positions) + # Independent_sent_embeddings is the ground truth input sentence embeddings + # for the document level BERT model. The output shape is [batch * + # max_predictions_per_seq, hidden]. + independent_sent_embeddings = gather_indexes(cur_sent_reps_doc_unmask, + sent_masked_positions) + + with tf.variable_scope("cls/sent_predictions", reuse=tf.AUTO_REUSE): + # We apply one more non-linear transformation before the output layer. + # This matrix is not used after pre-training. + with tf.variable_scope("transform"): + input_tensor = tf.layers.dense( + input_tensor, + units=bert_config.hidden_size, + activation=modeling.get_activation(bert_config.hidden_act), + kernel_initializer=modeling.create_initializer( + bert_config.initializer_range)) + # Output Shape: [batch * max_predictions_per_seq, hidden]. + input_tensor = modeling.layer_norm(input_tensor) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each predicted position. + output_bias = tf.get_variable( + "output_bias", + shape=[batch_size * max_predictions_per_seq], + initializer=tf.zeros_initializer()) + # Shape of input_tensor [batch * max_predictions_per_seq, hidden]. + # Shape of independent_sent_embeddings is [batch * max_predictions_per_seq, + # hidden]. + # Shape of logits: [batch * max_predictions_per_seq, + # batch * max_predictions_per_seq]. 
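+    # Illustrative sketch (added comment, not from the upstream SMITH code).
+    # The masked-sentence LM described above is an in-batch multi-class
+    # problem: with N = batch * max_predictions_per_seq predicted vectors P
+    # and the N unmasked "ground truth" sentence embeddings G, logits = P G^T
+    # is an [N, N] score matrix whose correct class for row i is column i, so
+    # the labels are simply range(N). A toy NumPy version (hypothetical sizes):
+    #
+    #   import numpy as np
+    #   N, H = 4, 8                      # batch * max_predictions_per_seq, hidden
+    #   P = np.random.randn(N, H)        # predictions at the masked positions
+    #   G = np.random.randn(N, H)        # unmasked sentence embeddings
+    #   logits = P @ G.T                 # [N, N]
+    #   z = logits - logits.max(-1, keepdims=True)
+    #   log_probs = z - np.log(np.exp(z).sum(-1, keepdims=True))
+    #   label_ids = np.arange(N)         # the diagonal is the positive class
+    #   per_example_loss = -log_probs[np.arange(N), label_ids]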
+ logits = tf.matmul( + input_tensor, independent_sent_embeddings, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + # Output Shape: [batch * max_predictions_per_seq, + # batch * max_predictions_per_seq]. + log_probs = tf.nn.log_softmax(logits, axis=-1) + + # Output Shape: [batch * max_predictions_per_seq]. + # Double checked the setting of label_ids here. The label_ids + # should be the label index in the "sentence vocabulary". Thus if batch=32, + # max_predictions_per_seq = 2, then label ids should be like + # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ..., 63]. For the ground truth one hot + # label matrix, only the values in the diagonal positions are 1. All the + # other positions should be 0. + label_ids = tf.range( + 0, batch_size * max_predictions_per_seq, dtype=tf.int32) + if debugging: + label_ids = tf.Print( + label_ids, [label_ids], + message="label_ids in get_masked_sent_lm_output", + summarize=30) + # Output Shape: [batch * max_predictions_per_seq]. + # The label_weights is the flatten vector based on sent_masked_weights, + # where the weight is 1.0 for sampled real sentences and 0.0 for sampled + # masked sentences. + label_weights = tf.reshape(sent_masked_weights, [-1]) + + # Output Shape: [batch * max_predictions_per_seq, + # batch * max_predictions_per_seq]. + one_hot_labels = tf.one_hot( + label_ids, depth=batch_size * max_predictions_per_seq, dtype=tf.float32) + + # Output Shape: [batch * max_predictions_per_seq]. + per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) + # Output Shape: [1]. + numerator = tf.reduce_sum(label_weights * per_example_loss) + # Output Shape: [1]. + denominator = tf.reduce_sum(label_weights) + 1e-5 + # Output Shape: [1]. + loss = numerator / denominator + # Shape of loss [1]. + # Shape of per_example_loss is [batch * max_predictions_per_seq]. + return (loss, per_example_loss, log_probs) + + +def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, + label_ids, label_weights): + """Get loss and log probs for the masked LM.""" + # Output Shape: [batch * max_predictions_per_seq, hidden]. + input_tensor = gather_indexes(input_tensor, positions) + + with tf.variable_scope("cls/word_predictions", reuse=tf.AUTO_REUSE): + # We apply one more non-linear transformation before the output layer. + # This matrix is not used after pre-training. + with tf.variable_scope("transform"): + input_tensor = tf.layers.dense( + input_tensor, + units=bert_config.hidden_size, + activation=modeling.get_activation(bert_config.hidden_act), + kernel_initializer=modeling.create_initializer( + bert_config.initializer_range)) + # Output Shape: [batch * max_predictions_per_seq, hidden]. + input_tensor = modeling.layer_norm(input_tensor) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + output_bias = tf.get_variable( + "output_bias", + shape=[bert_config.vocab_size], + initializer=tf.zeros_initializer()) + # Shape of input_tensor [batch * max_predictions_per_seq, embedding_size]. + # Shape of output_weights (embed table) is [vocab_size, embedding_size]. + # In the current Bert implementation: embedding_size = hidden. + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + # Output Shape: [batch * max_predictions_per_seq, vocab_size]. + log_probs = tf.nn.log_softmax(logits, axis=-1) + + # Output Shape: [batch * max_predictions_per_seq]. 
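+    # Illustrative sketch (added comment, not from the upstream SMITH code).
+    # The weighted masked-word loss below averages the per-prediction cross
+    # entropy with label_weights, so zero-weight (padded) predictions do not
+    # contribute. A toy NumPy version with hypothetical probabilities:
+    #
+    #   import numpy as np
+    #   log_probs = np.log([[0.7, 0.2, 0.1],     # 2 predictions, vocab = 3
+    #                       [0.1, 0.8, 0.1]])
+    #   label_ids = np.array([0, 1])
+    #   label_weights = np.array([1.0, 0.0])     # second prediction is padding
+    #   one_hot = np.eye(3)[label_ids]
+    #   per_example_loss = -(log_probs * one_hot).sum(-1)   # [-log 0.7, -log 0.8]
+    #   loss = (label_weights * per_example_loss).sum() / (label_weights.sum() + 1e-5)
+    #   # loss ~= -log(0.7); the padded prediction is excluded from the average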
+ label_ids = tf.reshape(label_ids, [-1]) + # Output Shape: [batch * max_predictions_per_seq]. + label_weights = tf.reshape(label_weights, [-1]) + + # Output Shape: [batch * max_predictions_per_seq, vocab_size]. + one_hot_labels = tf.one_hot( + label_ids, depth=bert_config.vocab_size, dtype=tf.float32) + + # The `positions` tensor might be zero-padded (if the sequence is too + # short to have the maximum number of predictions). The `label_weights` + # tensor has a value of 1.0 for every real prediction and 0.0 for the + # padding predictions. + # Output Shape: [batch * max_predictions_per_seq]. + per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) + # Output Shape: [1]. + numerator = tf.reduce_sum(label_weights * per_example_loss) + # Output Shape: [1]. + denominator = tf.reduce_sum(label_weights) + 1e-5 + # Output Shape: [1]. + loss = numerator / denominator + # Shape of loss [1]. + # Shape of per_example_loss is [batch * max_predictions_per_seq]. + return (loss, per_example_loss, log_probs) + + +def gather_indexes(sequence_tensor, positions): + """Gathers the vectors at the specific positions over a minibatch.""" + # Shape of positions: [batch, max_mask_per_seq]. + sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) + batch_size = sequence_shape[0] + seq_length = sequence_shape[1] + width = sequence_shape[2] + + # Shape of flat_offsets: [batch, 1]. + flat_offsets = tf.reshape( + tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) + flat_positions = tf.reshape(positions + flat_offsets, [-1]) + flat_sequence_tensor = tf.reshape(sequence_tensor, + [batch_size * seq_length, width]) + output_tensor = tf.gather(flat_sequence_tensor, flat_positions) + # The shape of output_tensor [batch * max_mask_per_seq, hidden]. + return output_tensor + + +def get_attention_weighted_sum(input_tensor, bert_config, is_training, + attention_size): + """Compute the attentive weighted sum of an input tensor. + + Args: + input_tensor: The input tensor for attentive representation. The shape of + input tensor is [batch, seq_length, hidden]. + bert_config: The model config file. + is_training: If true, it is in training mode. + attention_size: int. Dimension of contextual vector. + + Returns: + The attentive representation of the input tensor. The shape of the output + tensor is [batch, hidden]. + """ + with tf.variable_scope("combine_reps_attention", reuse=tf.AUTO_REUSE): + context_vector = tf.get_variable( + name="context_vector", + shape=[attention_size], + dtype=tf.float32) + # Output Shape: [batch, seq_length, attention_size]. + projection = tf.layers.dense( + input_tensor, + attention_size, + activation=tf.tanh, + kernel_initializer=modeling.create_initializer( + bert_config.initializer_range)) + # Output Shape: [batch, seq_length, 1]. + attention = tf.reduce_sum( + tf.multiply(projection, context_vector), axis=2, keep_dims=True) + # Output Shape: [batch, seq_length, 1]. + attention = tf.nn.softmax(attention, axis=1) + # Output Shape: [batch, hidden]. 
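+    # Illustrative sketch (added comment, not from the upstream SMITH code).
+    # The attention pooling above scores each position against a learned
+    # context vector and returns the softmax-weighted sum over the sequence
+    # axis. A toy NumPy version (hypothetical shapes, random parameters):
+    #
+    #   import numpy as np
+    #   batch, seq_len, hidden, att = 2, 3, 4, 5
+    #   x = np.random.randn(batch, seq_len, hidden)
+    #   W = np.random.randn(hidden, att)
+    #   context = np.random.randn(att)
+    #   proj = np.tanh(x @ W)                              # [batch, seq_len, att]
+    #   scores = (proj * context).sum(-1, keepdims=True)   # [batch, seq_len, 1]
+    #   weights = np.exp(scores) / np.exp(scores).sum(1, keepdims=True)
+    #   pooled = (x * weights).sum(1)                      # [batch, hidden]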
+ last_outputs = tf.reduce_sum(tf.multiply(input_tensor, attention), axis=1) + if is_training: + last_outputs = tf.layers.dropout( + last_outputs, bert_config.attention_probs_dropout_prob, training=True) + return last_outputs + + +def get_seq_rep_from_bert(bert_model): + """Get the sequence represenation given a BERT encoder.""" + siamese_input_tensor = bert_model.get_pooled_output() + hidden_size = siamese_input_tensor.shape[-1].value + siamese_input_tensor = tf.layers.dense( + siamese_input_tensor, units=hidden_size, activation=tf.nn.relu) + normalized_siamese_input_tensor = tf.nn.l2_normalize( + siamese_input_tensor, axis=1) + return normalized_siamese_input_tensor + + +def get_sent_reps_masks_normal_loop(sent_index, + input_sent_reps_doc, + input_mask_doc_level, + masked_lm_loss_doc, + masked_lm_example_loss_doc, + masked_lm_weights_doc, + dual_encoder_config, + is_training, + train_mode, + input_ids, + input_mask, + masked_lm_positions, + masked_lm_ids, + masked_lm_weights, + use_one_hot_embeddings, + debugging=False): + """Get the sentence encodings, mask ids and masked word LM loss. + + Args: + sent_index: The index of the current looped sentence. + input_sent_reps_doc: The representations of all sentences in the doc + learned by BERT. + input_mask_doc_level: The document level input masks, which indicates + whether a sentence is a real sentence or a padded sentence. + masked_lm_loss_doc: The sum of all the masked word LM loss. + masked_lm_example_loss_doc: The per example masked word LM loss. + masked_lm_weights_doc: the weights of the maksed LM words. If the position + is corresponding to a real masked word, it is 1.0; It is a padded mask, + the weight is 0. + dual_encoder_config: The config of the dual encoder. + is_training: Whether it is in the training mode. + train_mode: string. The train mode which can be finetune, joint_train, or + pretrain. + input_ids: The ids of the input tokens. + input_mask: The mask of the input tokens. + masked_lm_positions: The positions of the masked words in the language + model training. + masked_lm_ids: The ids of the masked words in LM model training. + masked_lm_weights: The weights of the masked words in LM model training. + use_one_hot_embeddings: Whether use one hot embedding. It should be true + for the runs on TPUs. + debugging: bool. Whether it is in the debugging mode. + + Returns: + A list of tensors on the learned sentence representations and the masked + word LM loss. + """ + # Collect token information for the current sentence. + bert_config = modeling.BertConfig.from_json_file( + dual_encoder_config.encoder_config.bert_config_file) + max_sent_length_by_word = dual_encoder_config.encoder_config.max_sent_length_by_word + sent_bert_trainable = dual_encoder_config.encoder_config.sent_bert_trainable + max_predictions_per_seq = dual_encoder_config.encoder_config.max_predictions_per_seq + sent_start = sent_index * max_sent_length_by_word + input_ids_cur_sent = tf.slice(input_ids, [0, sent_start], + [-1, max_sent_length_by_word]) + # Output shape: [batch, max_sent_length_by_word]. + input_mask_cur_sent = tf.slice(input_mask, [0, sent_start], + [-1, max_sent_length_by_word]) + # Output Shape: [batch]. + input_mask_cur_sent_max = tf.reduce_max(input_mask_cur_sent, 1) + # Output Shape: [loop_sent_number_per_doc, batch]. 
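+  # Illustrative sketch (added comment, not from the upstream SMITH code).
+  # Each sentence block is a fixed-width slice of the flattened document
+  # sequence, and the document-level mask for a block is the max over its word
+  # mask (1 if the block contains any real token). Toy NumPy version:
+  #
+  #   import numpy as np
+  #   max_sent_length_by_word = 4
+  #   input_ids = np.array([[101, 7, 8, 102,  101, 9, 102, 0,  0, 0, 0, 0]])
+  #   input_mask = (input_ids > 0).astype(np.int32)
+  #   for sent_index in range(3):
+  #     start = sent_index * max_sent_length_by_word
+  #     mask_cur = input_mask[:, start:start + max_sent_length_by_word]
+  #     is_real_sentence = mask_cur.max(axis=1)   # [1], [1], then [0] (padding)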
+ input_mask_doc_level.append(input_mask_cur_sent_max) + if debugging: + input_ids_cur_sent = tf.Print( + input_ids_cur_sent, [input_ids_cur_sent, input_mask_cur_sent], + message="input_ids_cur_sent in get_sent_reps_masks_lm_loss", + summarize=20) + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids_cur_sent, + input_mask=input_mask_cur_sent, + use_one_hot_embeddings=use_one_hot_embeddings, + sent_bert_trainable=sent_bert_trainable) + with tf.variable_scope("seq_rep_from_bert_sent_dense", reuse=tf.AUTO_REUSE): + normalized_siamese_input_tensor = get_seq_rep_from_bert(model) + input_sent_reps_doc.append(normalized_siamese_input_tensor) + + if (train_mode == constants.TRAIN_MODE_PRETRAIN or + train_mode == constants.TRAIN_MODE_JOINT_TRAIN): + # Collect masked token information for the current sentence. + sent_mask_lm_token_start = sent_index * max_predictions_per_seq + # Output shape: [batch, max_predictions_per_seq]. + masked_lm_positions_cur_sent = tf.slice(masked_lm_positions, + [0, sent_mask_lm_token_start], + [-1, max_predictions_per_seq]) + masked_lm_ids_cur_sent = tf.slice(masked_lm_ids, + [0, sent_mask_lm_token_start], + [-1, max_predictions_per_seq]) + masked_lm_weights_cur_sent = tf.slice(masked_lm_weights, + [0, sent_mask_lm_token_start], + [-1, max_predictions_per_seq]) + # Since in the processed data of smith model, the masked lm positions are + # global indices started from the 1st token of the whole sequence, we need + # to transform this global position to a local position for the current + # sentence. The position index is started from 0. + # Local_index = global_index mod max_sent_length_by_word. + masked_lm_positions_cur_sent = tf.mod(masked_lm_positions_cur_sent, + max_sent_length_by_word) + # Shape of masked_lm_loss_cur_sent [1]. + # Shape of masked_lm_example_loss_cur_sent is [batch, + # max_predictions_per_seq]. + (masked_lm_loss_cur_sent, masked_lm_example_loss_cur_sent, + _) = get_masked_lm_output(bert_config, model.get_sequence_output(), + model.get_embedding_table(), + masked_lm_positions_cur_sent, + masked_lm_ids_cur_sent, + masked_lm_weights_cur_sent) + # Output Shape: [1]. + masked_lm_loss_doc += masked_lm_loss_cur_sent + # Output Shape: [loop_sent_number_per_doc, batch * max_predictions_per_seq]. + masked_lm_example_loss_doc.append(masked_lm_example_loss_cur_sent) + # Output Shape: [loop_sent_number_per_doc, batch, max_predictions_per_seq]. + masked_lm_weights_doc.append(masked_lm_weights_cur_sent) + return (input_sent_reps_doc, input_mask_doc_level, masked_lm_loss_doc, + masked_lm_example_loss_doc, masked_lm_weights_doc) + + +def learn_sent_reps_normal_loop(dual_encoder_config, is_training, train_mode, + input_ids_1, input_mask_1, + masked_lm_positions_1, masked_lm_ids_1, + masked_lm_weights_1, input_ids_2, input_mask_2, + masked_lm_positions_2, masked_lm_ids_2, + masked_lm_weights_2, use_one_hot_embeddings): + """Learn the sentence representations with normal loop functions.""" + input_sent_reps_doc_1 = [] + # Generate document level input masks on each sentence based on the word + # level input mask information. + input_mask_doc_level_1 = [] + masked_lm_loss_doc_1 = 0.0 + masked_lm_example_loss_doc_1 = [] + masked_lm_weights_doc_1 = [] + + input_mask_doc_level_2 = [] + input_sent_reps_doc_2 = [] + masked_lm_loss_doc_2 = 0.0 + masked_lm_example_loss_doc_2 = [] + masked_lm_weights_doc_2 = [] + + # Learn the representation for each sentence in the document. 
+ # Setting smaller number of loop_sent_number_per_doc can save memory for the + # model training. + # Shape of masked_lm_loss_doc_1 [1]. + # Shape of masked_lm_example_loss_doc_1 is [max_doc_length_by_sentence, + # batch * max_predictions_per_seq]. + for sent_index in range( + 0, dual_encoder_config.encoder_config.loop_sent_number_per_doc): + (input_sent_reps_doc_1, input_mask_doc_level_1, masked_lm_loss_doc_1, + masked_lm_example_loss_doc_1, + masked_lm_weights_doc_1) = get_sent_reps_masks_normal_loop( + sent_index, input_sent_reps_doc_1, input_mask_doc_level_1, + masked_lm_loss_doc_1, masked_lm_example_loss_doc_1, + masked_lm_weights_doc_1, dual_encoder_config, is_training, train_mode, + input_ids_1, input_mask_1, masked_lm_positions_1, masked_lm_ids_1, + masked_lm_weights_1, use_one_hot_embeddings) + (input_sent_reps_doc_2, input_mask_doc_level_2, masked_lm_loss_doc_2, + masked_lm_example_loss_doc_2, + masked_lm_weights_doc_2) = get_sent_reps_masks_normal_loop( + sent_index, input_sent_reps_doc_2, input_mask_doc_level_2, + masked_lm_loss_doc_2, masked_lm_example_loss_doc_2, + masked_lm_weights_doc_2, dual_encoder_config, is_training, train_mode, + input_ids_2, input_mask_2, masked_lm_positions_2, masked_lm_ids_2, + masked_lm_weights_2, use_one_hot_embeddings) + + # Stack the sentence representations to learn the doc representations. + # Output Shape: [batch, loop_sent_number_per_doc, hidden]. + input_sent_reps_doc_1_unmask = tf.stack(input_sent_reps_doc_1, axis=1) + input_sent_reps_doc_2_unmask = tf.stack(input_sent_reps_doc_2, axis=1) + + # Output Shape: [batch, loop_sent_number_per_doc]. + input_mask_doc_level_1_tensor = tf.stack(input_mask_doc_level_1, axis=1) + input_mask_doc_level_2_tensor = tf.stack(input_mask_doc_level_2, axis=1) + + if (train_mode == constants.TRAIN_MODE_PRETRAIN or + train_mode == constants.TRAIN_MODE_JOINT_TRAIN): + # Output Shape: [batch * max_predictions_per_seq, + # loop_sent_number_per_doc]. + masked_lm_example_loss_doc_1 = tf.stack( + masked_lm_example_loss_doc_1, axis=1) + masked_lm_example_loss_doc_2 = tf.stack( + masked_lm_example_loss_doc_2, axis=1) + + # Output Shape: [batch, loop_sent_number_per_doc, max_predictions_per_seq]. + masked_lm_weights_doc_1 = tf.stack(masked_lm_weights_doc_1, axis=1) + masked_lm_weights_doc_2 = tf.stack(masked_lm_weights_doc_2, axis=1) + else: + masked_lm_example_loss_doc_1 = tf.zeros([1]) + masked_lm_example_loss_doc_2 = tf.zeros([1]) + masked_lm_weights_doc_1 = tf.zeros([1]) + masked_lm_weights_doc_2 = tf.zeros([1]) + + return (input_sent_reps_doc_1_unmask, input_mask_doc_level_1_tensor, + input_sent_reps_doc_2_unmask, input_mask_doc_level_2_tensor, + masked_lm_loss_doc_1, masked_lm_loss_doc_2, + masked_lm_example_loss_doc_1, masked_lm_example_loss_doc_2, + masked_lm_weights_doc_1, masked_lm_weights_doc_2) + diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling_orig.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling_orig.py new file mode 100644 index 000000000..a2373128f --- /dev/null +++ b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling_orig.py @@ -0,0 +1,491 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Dual encoder SMITH models.""" +from npu_bridge.npu_init import * + +import tensorflow.compat.v1 as tf + +from smith import constants +from smith import layers +from smith import loss_fns +from smith import metric_fns +from smith import utils +from smith.bert import modeling +from smith.bert import optimization + +# Add by:TC +import precision_tool.tf_config as npu_tf_config + +def build_smith_dual_encoder(dual_encoder_config, + train_mode, + is_training, + input_ids_1, + input_mask_1, + masked_lm_positions_1, + masked_lm_ids_1, + masked_lm_weights_1, + input_ids_2, + input_mask_2, + masked_lm_positions_2, + masked_lm_ids_2, + masked_lm_weights_2, + use_one_hot_embeddings, + documents_match_labels, + debugging=False): + """Build the dual encoder SMITH model. + + Args: + dual_encoder_config: the configuration file for the dual encoder model. + train_mode: string. The train mode of the current. It can be finetune, + pretrain or joint_train. + is_training: bool. Whether it in training mode. + input_ids_1: int Tensor with shape [batch, max_seq_length]. The input ids of + input examples of text 1. + input_mask_1: int Tensor with shape [batch, max_seq_length]. The input masks + of input examples of text 1. + masked_lm_positions_1: int Tensor with shape [batch, + max_predictions_per_seq]. The input masked LM prediction positions of + input examples of text 1. This can be useful to compute the masked word + prediction LM loss. + masked_lm_ids_1: int Tensor with shape [batch, max_predictions_per_seq]. The + input masked LM prediction ids of input examples of text 1. It is the + ground truth in the masked word LM prediction task. This can be useful to + compute the masked word prediction LM loss. + masked_lm_weights_1: float Tensor with shape [batch, + max_predictions_per_seq]. The input masked LM prediction weights of input + examples of text 1. + input_ids_2: int Tensor with shape [batch, max_seq_length]. The input ids of + input examples of text 2. + input_mask_2: int Tensor with shape [batch, max_seq_length]. The input masks + of input examples of text 2. + masked_lm_positions_2: int Tensor with shape [batch, + max_predictions_per_seq]. The input masked LM prediction positions of + input examples of text 2. This can be useful to compute the masked word + prediction LM loss. + masked_lm_ids_2: int Tensor with shape [batch, max_predictions_per_seq]. 
The + input masked LM prediction ids of input examples of text 2. It is the + ground truth in the masked word LM prediction task. This can be useful to + compute the masked word prediction LM loss. + masked_lm_weights_2: float Tensor with shape [batch, + max_predictions_per_seq]. The input masked LM prediction weights of input + examples of text 2. + use_one_hot_embeddings: bool. Whether use one hot embeddings. + documents_match_labels: float Tensor with shape [batch]. The ground truth + labels for the input examples. + debugging: bool. Whether it is in the debugging mode. + + Returns: + The masked LM loss, per example LM loss, masked sentence LM loss, per + example masked sentence LM loss, sequence representations, text matching + loss, per example text matching loss, text matching logits, text matching + probabilities and text matching log probabilities. + + Raises: + ValueError: if the doc_rep_combine_mode in dual_encoder_config is invalid. + """ + bert_config = modeling.BertConfig.from_json_file( + dual_encoder_config.encoder_config.bert_config_file) + doc_bert_config = modeling.BertConfig.from_json_file( + dual_encoder_config.encoder_config.doc_bert_config_file) + (input_sent_reps_doc_1_unmask, input_mask_doc_level_1_tensor, + input_sent_reps_doc_2_unmask, input_mask_doc_level_2_tensor, + masked_lm_loss_doc_1, masked_lm_loss_doc_2, masked_lm_example_loss_doc_1, + masked_lm_example_loss_doc_2, masked_lm_weights_doc_1, + masked_lm_weights_doc_2) = layers.learn_sent_reps_normal_loop( + dual_encoder_config, is_training, train_mode, input_ids_1, input_mask_1, + masked_lm_positions_1, masked_lm_ids_1, masked_lm_weights_1, input_ids_2, + input_mask_2, masked_lm_positions_2, masked_lm_ids_2, + masked_lm_weights_2, use_one_hot_embeddings) + if debugging: + input_mask_doc_level_1_tensor = tf.Print( + input_mask_doc_level_1_tensor, + [input_mask_doc_level_1_tensor, input_mask_doc_level_2_tensor], + message="input_mask_doc_level_1_tensor in build_smith_dual_encoder", + summarize=30) + + if dual_encoder_config.encoder_config.use_masked_sentence_lm_loss: + batch_size_static = ( + dual_encoder_config.train_eval_config.train_batch_size if is_training + else dual_encoder_config.train_eval_config.eval_batch_size) + # Generates the sentence masked document represenations. + with tf.variable_scope("mask_sent_in_doc", reuse=tf.AUTO_REUSE): + # Randomly initialize a masked sentence vector and reuse it. + # We also need to return the masked sentence position index to get the + # ground truth labels for the masked positions. The shape of + # sent_mask_embedding is [hidden]. + sent_mask_embedding = tf.get_variable( + name="sentence_mask_embedding", + shape=[bert_config.hidden_size], + initializer=tf.truncated_normal_initializer( + stddev=bert_config.initializer_range)) + # Output Shape: [batch, loop_sent_number_per_doc, hidden]. 
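+      # Illustrative sketch (added comment, not from the upstream SMITH code).
+      # get_doc_rep_with_masked_sent replaces the sentence vectors at the
+      # sampled positions with the shared sentence_mask_embedding (the patch
+      # uses tf.tensor_scatter_update for this). Toy NumPy equivalent for one
+      # document:
+      #
+      #   import numpy as np
+      #   n_sent, hidden = 5, 3
+      #   sent_reps = np.ones((n_sent, hidden))
+      #   sent_mask_embedding = np.full(hidden, -1.0)
+      #   masked_positions = np.array([1, 3])     # sampled sentence indices
+      #   sent_reps[masked_positions] = sent_mask_embedding
+      #   # rows 1 and 3 now hold the shared mask vector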
+ (input_sent_reps_doc_1_masked, masked_sent_index_1, + masked_sent_weight_1) = layers.get_doc_rep_with_masked_sent( + input_sent_reps_doc=input_sent_reps_doc_1_unmask, + sent_mask_embedding=sent_mask_embedding, + input_mask_doc_level=input_mask_doc_level_1_tensor, + batch_size_static=batch_size_static, + max_masked_sent_per_doc=dual_encoder_config.encoder_config + .max_masked_sent_per_doc, + loop_sent_number_per_doc=dual_encoder_config.encoder_config + .loop_sent_number_per_doc) + (input_sent_reps_doc_2_masked, masked_sent_index_2, + masked_sent_weight_2) = layers.get_doc_rep_with_masked_sent( + input_sent_reps_doc=input_sent_reps_doc_2_unmask, + sent_mask_embedding=sent_mask_embedding, + input_mask_doc_level=input_mask_doc_level_2_tensor, + batch_size_static=batch_size_static, + max_masked_sent_per_doc=dual_encoder_config.encoder_config + .max_masked_sent_per_doc, + loop_sent_number_per_doc=dual_encoder_config.encoder_config + .loop_sent_number_per_doc) + # Learn the document representations based on masked sentence embeddings. + # Note that the variables in the DocBert model are not within the + # "mask_sent_in_doc" variable scope. + model_doc_1 = modeling.DocBertModel( + config=doc_bert_config, + is_training=is_training, + input_reps=input_sent_reps_doc_1_masked, + input_mask=input_mask_doc_level_1_tensor) + model_doc_2 = modeling.DocBertModel( + config=doc_bert_config, + is_training=is_training, + input_reps=input_sent_reps_doc_2_masked, + input_mask=input_mask_doc_level_2_tensor) + # Shape of masked_sent_lm_loss_1 [1]. + # Shape of masked_sent_lm_example_loss_1 is [batch * + # max_predictions_per_seq]. + (masked_sent_lm_loss_1, masked_sent_per_example_loss_1, + _) = layers.get_masked_sent_lm_output(doc_bert_config, + model_doc_1.get_sequence_output(), + input_sent_reps_doc_1_unmask, + masked_sent_index_1, + masked_sent_weight_1) + (masked_sent_lm_loss_2, masked_sent_per_example_loss_2, + _) = layers.get_masked_sent_lm_output(doc_bert_config, + model_doc_2.get_sequence_output(), + input_sent_reps_doc_2_unmask, + masked_sent_index_2, + masked_sent_weight_2) + else: + # Learn the document representations based on unmasked sentence embeddings. + model_doc_1 = modeling.DocBertModel( + config=doc_bert_config, + is_training=is_training, + input_reps=input_sent_reps_doc_1_unmask, + input_mask=input_mask_doc_level_1_tensor) + model_doc_2 = modeling.DocBertModel( + config=doc_bert_config, + is_training=is_training, + input_reps=input_sent_reps_doc_2_unmask, + input_mask=input_mask_doc_level_2_tensor) + masked_sent_lm_loss_1 = 0 + masked_sent_lm_loss_2 = 0 + masked_sent_per_example_loss_1 = tf.zeros(1) + masked_sent_per_example_loss_2 = tf.zeros(1) + masked_sent_weight_1 = tf.zeros(1) + masked_sent_weight_2 = tf.zeros(1) + + with tf.variable_scope("seq_rep_from_bert_doc_dense", reuse=tf.AUTO_REUSE): + normalized_doc_rep_1 = layers.get_seq_rep_from_bert(model_doc_1) + normalized_doc_rep_2 = layers.get_seq_rep_from_bert(model_doc_2) + + # We also dump the contextualized sentence embedding output by document + # level Transformer model. These representations maybe useful for sentence + # level tasks. + output_sent_reps_doc_1 = model_doc_1.get_sequence_output() + output_sent_reps_doc_2 = model_doc_2.get_sequence_output() + + # Here we support multiple modes to generate the final document + # representations based on the word/sentence/document level representations + # 1. normal: only use the document level representation as the final document + # representations. + # 2. 
sum_concat: firstly compute the sum of all sentence level repsentations. + # Then concatenate the sum vector with the document level representations. + # 3. mean_concat: firstly compute the mean of all sentence level + # repsentations. Then concatenate the mean vector with the document level + # representations. + # 4. attention: firstly compute the weighted sum of sentence level + # representations with attention mechanism, then concatenate the weighted sum + # vector with the document level representations. + # The document level mask is to indicate whether each sentence is + # a real sentence (1) or a paded sentence (0). The shape of + # input_mask_doc_level_1_tensor is [batch, max_doc_length_by_sentence]. The + # shape of input_sent_reps_doc_1_unmask is + # [batch, max_doc_length_by_sentence, hidden]. + final_doc_rep_combine_mode = dual_encoder_config.encoder_config.doc_rep_combine_mode + if final_doc_rep_combine_mode == constants.DOC_COMBINE_NORMAL: + final_doc_rep_1 = normalized_doc_rep_1 + final_doc_rep_2 = normalized_doc_rep_2 + elif final_doc_rep_combine_mode == constants.DOC_COMBINE_SUM_CONCAT: + # Output Shape: [batch, 2*hidden]. + final_doc_rep_1 = tf.concat( + [tf.reduce_sum(input_sent_reps_doc_1_unmask, 1), normalized_doc_rep_1], + axis=1) + final_doc_rep_2 = tf.concat( + [tf.reduce_sum(input_sent_reps_doc_2_unmask, 1), normalized_doc_rep_2], + axis=1) + elif final_doc_rep_combine_mode == constants.DOC_COMBINE_MEAN_CONCAT: + final_doc_rep_1 = tf.concat( + [tf.reduce_mean(input_sent_reps_doc_1_unmask, 1), normalized_doc_rep_1], + axis=1) + final_doc_rep_2 = tf.concat( + [tf.reduce_mean(input_sent_reps_doc_2_unmask, 1), normalized_doc_rep_2], + axis=1) + elif final_doc_rep_combine_mode == constants.DOC_COMBINE_ATTENTION: + final_doc_rep_1 = tf.concat([ + layers.get_attention_weighted_sum( + input_sent_reps_doc_1_unmask, bert_config, is_training, + dual_encoder_config.encoder_config.doc_rep_combine_attention_size), + normalized_doc_rep_1 + ], + axis=1) + final_doc_rep_2 = tf.concat([ + layers.get_attention_weighted_sum( + input_sent_reps_doc_2_unmask, bert_config, is_training, + dual_encoder_config.encoder_config.doc_rep_combine_attention_size), + normalized_doc_rep_2 + ], + axis=1) + else: + raise ValueError("Only normal, sum_concat, mean_concat and attention are" + " supported: %s" % final_doc_rep_combine_mode) + (siamese_loss, siamese_example_loss, + siamese_logits) = loss_fns.get_prediction_loss_cosine( + input_tensor_1=final_doc_rep_1, + input_tensor_2=final_doc_rep_2, + labels=documents_match_labels, + similarity_score_amplifier=dual_encoder_config.loss_config + .similarity_score_amplifier, + neg_to_pos_example_ratio=dual_encoder_config.train_eval_config + .neg_to_pos_example_ratio) + + # The shape of masked_lm_loss_doc is [1]. + # The shape of masked_lm_example_loss_doc is [batch * max_predictions_per_seq, + # max_doc_length_by_sentence]. 
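+  # Illustrative sketch (added comment, not from the upstream SMITH code).
+  # The doc_rep_combine_mode options above differ only in how the sentence
+  # level representations are pooled before being concatenated with the
+  # document level representation. Toy NumPy version of sum_concat and
+  # mean_concat (hypothetical shapes):
+  #
+  #   import numpy as np
+  #   batch, n_sent, hidden = 2, 3, 4
+  #   sent_reps = np.random.randn(batch, n_sent, hidden)
+  #   doc_rep = np.random.randn(batch, hidden)
+  #   sum_concat = np.concatenate([sent_reps.sum(1), doc_rep], axis=1)    # [batch, 2*hidden]
+  #   mean_concat = np.concatenate([sent_reps.mean(1), doc_rep], axis=1)  # [batch, 2*hidden]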
+ return (masked_lm_loss_doc_1, masked_lm_loss_doc_2, + masked_lm_example_loss_doc_1, masked_lm_example_loss_doc_2, + masked_lm_weights_doc_1, masked_lm_weights_doc_2, + masked_sent_lm_loss_1, masked_sent_lm_loss_2, + masked_sent_per_example_loss_1, masked_sent_per_example_loss_2, + masked_sent_weight_1, masked_sent_weight_2, final_doc_rep_1, + final_doc_rep_2, input_sent_reps_doc_1_unmask, + input_sent_reps_doc_2_unmask, output_sent_reps_doc_1, + output_sent_reps_doc_2, siamese_loss, siamese_example_loss, + siamese_logits) + + +def model_fn_builder(dual_encoder_config, + train_mode, + learning_rate, + num_train_steps, + num_warmup_steps, + use_tpu, + use_one_hot_embeddings, + debugging=False): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + tf.logging.info("*** Current mode: %s ***" % mode) + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids_1 = features["input_ids_1"] + input_mask_1 = features["input_mask_1"] + if train_mode == constants.TRAIN_MODE_FINETUNE: + masked_lm_positions_1 = tf.zeros([1]) + masked_lm_ids_1 = tf.zeros([1]) + masked_lm_weights_1 = tf.zeros([1]) + else: + masked_lm_positions_1 = features["masked_lm_positions_1"] + masked_lm_ids_1 = features["masked_lm_ids_1"] + masked_lm_weights_1 = features["masked_lm_weights_1"] + + input_ids_2 = features["input_ids_2"] + input_mask_2 = features["input_mask_2"] + if train_mode == constants.TRAIN_MODE_FINETUNE: + masked_lm_positions_2 = tf.zeros([1]) + masked_lm_ids_2 = tf.zeros([1]) + masked_lm_weights_2 = tf.zeros([1]) + else: + masked_lm_positions_2 = features["masked_lm_positions_2"] + masked_lm_ids_2 = features["masked_lm_ids_2"] + masked_lm_weights_2 = features["masked_lm_weights_2"] + documents_match_labels = features["documents_match_labels"] + # Since the document_match_labels might contain labels like 0/1/2, we need + # to transfer these labels to binary labels like 0/1. + documents_match_labels = tf.cast(documents_match_labels > 0, tf.float32) + is_real_example = None + if "is_real_example" in features: + is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) + else: + is_real_example = tf.ones( + tf.shape(documents_match_labels), dtype=tf.float32) + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + if (dual_encoder_config.encoder_config.model_name == + constants.MODEL_NAME_SMITH_DUAL_ENCODER): + # For the smith model, since the actual looped number of sentences per + # document maybe smaller than max_doc_length_by_sentence, we need to + # overwrite the lm weights with the actual lm weights returned by the + # function. 
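+      # Illustrative note (added comment, not from the upstream SMITH code).
+      # documents_match_labels above may be graded (e.g. 0/1/2) and is reduced
+      # to a binary match label before computing the matching loss, e.g.:
+      #
+      #   import numpy as np
+      #   raw = np.array([0, 1, 2, 0])
+      #   binary = (raw > 0).astype(np.float32)   # -> [0., 1., 1., 0.]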
+ (masked_lm_loss_1, masked_lm_loss_2, masked_lm_example_loss_1, + masked_lm_example_loss_2, masked_lm_weights_1, masked_lm_weights_2, + masked_sent_lm_loss_1, masked_sent_lm_loss_2, + masked_sent_per_example_loss_1, masked_sent_per_example_loss_2, + masked_sent_weight_1, masked_sent_weight_2, seq_embed_1, seq_embed_2, + input_sent_embed_1, input_sent_embed_2, output_sent_embed_1, + output_sent_embed_2, siamese_loss, + siamese_example_loss, siamese_logits) = build_smith_dual_encoder( + dual_encoder_config, train_mode, is_training, input_ids_1, + input_mask_1, masked_lm_positions_1, masked_lm_ids_1, + masked_lm_weights_1, input_ids_2, input_mask_2, + masked_lm_positions_2, masked_lm_ids_2, masked_lm_weights_2, + use_one_hot_embeddings, documents_match_labels, debugging) + else: + raise ValueError( + "Only smith_dual_encoder is supported: %s" % + dual_encoder_config.encoder_config.model_name) + + # There are three different modes for training in the smith model. + # 1. joint_train: a multi-task learning setting which combines the masked + # word LM losses for doc1/doc2 and the siamese matching loss. If we add the + # masked sentence LM task, we also add the masked sentence LM losses for + # the two documents. + # 2. pretrain: only contains the masked word LM losses for doc1/doc2. We + # currently didn't include the NSP loss since NSP loss is not very useful + # according to the XLNet/ RoBERTa/ ALBERT paper. If we add the masked + # sentence LM task, we also add the masked sentence LM losses for the + # two documents. + # 3. finetune: fine tune the model with loaded pretrained checkpoint only + # with the siamese matching loss. If we add the masked sentence LM task, + # we also add the masked sentence LM losses for the two documents. + if train_mode == constants.TRAIN_MODE_JOINT_TRAIN: + total_loss = masked_lm_loss_1 + masked_lm_loss_2 + siamese_loss + elif train_mode == constants.TRAIN_MODE_PRETRAIN: + total_loss = masked_lm_loss_1 + masked_lm_loss_2 + elif train_mode == constants.TRAIN_MODE_FINETUNE: + total_loss = siamese_loss + else: + raise ValueError("Only joint_train, pretrain, finetune are supported.") + # If we add the masked sentence LM task, we also add the masked sentence + # LM losses for the two documents. + if dual_encoder_config.encoder_config.use_masked_sentence_lm_loss: + total_loss += (masked_sent_lm_loss_1 + masked_sent_lm_loss_2) + + total_loss = tf.identity(total_loss, name='total_loss') + + tvars = tf.trainable_variables() + initialized_variable_names = {} + scaffold_fn = None + init_checkpoint = dual_encoder_config.encoder_config.init_checkpoint + # Load pretrained BERT checkpoints if there is a specified path. 
+ if init_checkpoint: + tf.logging.info("**** Passed pretrained BERT checkpoint = %s ****", + init_checkpoint) + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = ", *INIT_RANDOMLY*" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + output_spec = None + predicted_score = tf.sigmoid(siamese_logits) + predicted_class = tf.round(predicted_score) + + if dual_encoder_config.encoder_config.model_name == constants.MODEL_NAME_SMITH_DUAL_ENCODER: + _, prediction_dict = utils.get_export_outputs_prediction_dict_smith_de( + seq_embed_1, seq_embed_2, predicted_score, predicted_class, + documents_match_labels, input_sent_embed_1, input_sent_embed_2, + output_sent_embed_1, output_sent_embed_2) + else: + raise ValueError("Unsupported model: %s" % dual_encoder_config.encoder_config.model_name) + + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer(total_loss, learning_rate, + num_train_steps, + num_warmup_steps, use_tpu) + # Add by:TC 20220705 + output_spec = tf.estimator.EstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + training_hooks=[npu_tf_config.estimator_dump()]) + + elif mode == tf.estimator.ModeKeys.EVAL: + if (train_mode == constants.TRAIN_MODE_JOINT_TRAIN or + train_mode == constants.TRAIN_MODE_PRETRAIN): + eval_metrics = (metric_fns.metric_fn_pretrain, [ + masked_lm_example_loss_1, masked_lm_weights_1, + masked_sent_per_example_loss_1, masked_sent_weight_1, + masked_lm_example_loss_2, masked_lm_weights_2, + masked_sent_per_example_loss_2, masked_sent_weight_2, + predicted_class, documents_match_labels, is_real_example + ]) + elif train_mode == constants.TRAIN_MODE_FINETUNE: + eval_metrics = (metric_fns.metric_fn_finetune, [ + predicted_class, documents_match_labels, siamese_example_loss, + is_real_example + ]) + else: + raise ValueError("Only joint_train, pretrain, finetune are supported.") + output_spec = tf.estimator.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + + elif mode == tf.estimator.ModeKeys.PREDICT: + output_spec = tf.estimator.tpu.TPUEstimatorSpec( + mode=mode, predictions=prediction_dict, scaffold_fn=scaffold_fn) + else: + raise ValueError("Only TRAIN, EVAL, PREDICT modes are supported: %s" % mode) + + return output_spec + + return model_fn + diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_orig.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_orig.py new file mode 100644 index 000000000..2818aabc8 --- /dev/null +++ b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_orig.py @@ -0,0 +1,547 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Library to preprocess text data into SMITH dual encoder model inputs.""" +from npu_bridge.npu_init import * +import collections +import random +import nltk +import tensorflow.compat.v1 as tf +import tqdm +from smith import utils +from smith import wiki_doc_pair_pb2 +from smith.bert import tokenization + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, "Input data path.") + +flags.DEFINE_string( + "output_file", None, + "Output TF examples (or comma-separated list of files) in TFRecord " + "files.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the SMITH model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_bool("add_masks_lm", True, + "If true, add masks for word prediction LM pre-training.") + +flags.DEFINE_integer( + "max_sent_length_by_word", 32, "The maximum length of a sentence by tokens." + "A sentence will be cut off if longer than this length, and will be padded " + "if shorter than it. The sentence can also be a sentence block.") + +flags.DEFINE_integer( + "max_doc_length_by_sentence", 64, + "The maximum length of a document by sentences. 
A " + "document will be cut off if longer than this length, and" + "will be padded if shorter than it.") + +flags.DEFINE_bool( + "greedy_sentence_filling", True, + "If true, apply the greedy sentence filling trick to reduce the " + "number of padded tokens.") + +flags.DEFINE_integer("max_predictions_per_seq", 5, + "Maximum number of masked LM predictions per sequence.") + +flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") + +flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") + + +class TrainingInstance(object): + """A single training instance (sentence pair as dual encoder model inputs).""" + + def __init__(self, + tokens_1, + segment_ids_1, + masked_lm_positions_1, + masked_lm_labels_1, + input_mask_1, + masked_lm_weights_1, + tokens_2, + segment_ids_2, + masked_lm_positions_2, + masked_lm_labels_2, + input_mask_2, + masked_lm_weights_2, + instance_id, + documents_match_labels=-1.0): + self.tokens_1 = tokens_1 + self.segment_ids_1 = segment_ids_1 + self.masked_lm_positions_1 = masked_lm_positions_1 + self.masked_lm_labels_1 = masked_lm_labels_1 + self.input_mask_1 = input_mask_1 + self.masked_lm_weights_1 = masked_lm_weights_1 + self.tokens_2 = tokens_2 + self.segment_ids_2 = segment_ids_2 + self.masked_lm_positions_2 = masked_lm_positions_2 + self.masked_lm_labels_2 = masked_lm_labels_2 + self.input_mask_2 = input_mask_2 + self.masked_lm_weights_2 = masked_lm_weights_2 + self.instance_id = instance_id + self.documents_match_labels = documents_match_labels + + def __str__(self): + s = "" + s += "instance_id: %s\n" % self.instance_id + s += "documents_match_labels: %s\n" % (str(self.documents_match_labels)) + s += "tokens_1: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.tokens_1])) + s += "segment_ids_1: %s\n" % (" ".join([str(x) for x in self.segment_ids_1 + ])) + s += "masked_lm_positions_1: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_positions_1])) + s += "masked_lm_labels_1: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.masked_lm_labels_1])) + s += "input_mask_1: %s\n" % (" ".join([str(x) for x in self.input_mask_1])) + s += "masked_lm_weights_1: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_weights_1])) + s += "tokens_2: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.tokens_2])) + s += "segment_ids_2: %s\n" % (" ".join([str(x) for x in self.segment_ids_2 + ])) + s += "masked_lm_positions_2: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_positions_2])) + s += "masked_lm_labels_2: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.masked_lm_labels_2])) + s += "input_mask_2: %s\n" % (" ".join([str(x) for x in self.input_mask_2])) + s += "masked_lm_weights_2: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_weights_2])) + s += "\n" + return s + + def __repr__(self): + return self.__str__() + + +def add_features_for_one_doc(features, tokens, segment_ids, input_mask, + masked_lm_positions, masked_lm_labels, + masked_lm_weights, tokenizer, doc_index): + """Add features for one document in a WikiDocPair example.""" + input_ids = tokenizer.convert_tokens_to_ids(tokens) + features["input_ids_" + doc_index] = utils.create_int_feature(input_ids) + features["input_mask_" + doc_index] = utils.create_int_feature(input_mask) + features["segment_ids_" + doc_index] = utils.create_int_feature(segment_ids) + + if masked_lm_labels: + masked_lm_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels) + features["masked_lm_positions_" + + doc_index] = 
utils.create_int_feature(masked_lm_positions) + features["masked_lm_ids_" + + doc_index] = utils.create_int_feature(masked_lm_ids) + features["masked_lm_weights_" + + doc_index] = utils.create_float_feature(masked_lm_weights) + + +def write_instance_to_example_files(instances, tokenizer, output_files): + """Create TF example files from `TrainingInstance`s.""" + writers = [] + for output_file in output_files: + writers.append(tf.python_io.TFRecordWriter(output_file)) + writer_index = 0 + total_written = 0 + for (inst_index, instance) in enumerate(instances): + features = collections.OrderedDict() + add_features_for_one_doc( + features=features, + tokens=instance.tokens_1, + segment_ids=instance.segment_ids_1, + input_mask=instance.input_mask_1, + masked_lm_positions=instance.masked_lm_positions_1, + masked_lm_labels=instance.masked_lm_labels_1, + masked_lm_weights=instance.masked_lm_weights_1, + tokenizer=tokenizer, + doc_index="1") + add_features_for_one_doc( + features=features, + tokens=instance.tokens_2, + segment_ids=instance.segment_ids_2, + input_mask=instance.input_mask_2, + masked_lm_positions=instance.masked_lm_positions_2, + masked_lm_labels=instance.masked_lm_labels_2, + masked_lm_weights=instance.masked_lm_weights_2, + tokenizer=tokenizer, + doc_index="2") + # Adds fields on more content/id information of the current example. + features["instance_id"] = utils.create_bytes_feature( + [bytes(instance.instance_id, "utf-8")]) + features["tokens_1"] = utils.create_bytes_feature( + [bytes(t, "utf-8") for t in instance.tokens_1]) + features["tokens_2"] = utils.create_bytes_feature( + [bytes(t, "utf-8") for t in instance.tokens_2]) + # Adds the documents matching labels. + features["documents_match_labels"] = utils.create_float_feature( + [float(instance.documents_match_labels)]) + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + + writers[writer_index].write(tf_example.SerializeToString()) + writer_index = (writer_index + 1) % len(writers) + + total_written += 1 + + if inst_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info( + "tokens_1: %s" % + " ".join([tokenization.printable_text(x) for x in instance.tokens_1])) + tf.logging.info( + "tokens_2: %s" % + " ".join([tokenization.printable_text(x) for x in instance.tokens_2])) + + for feature_name in features.keys(): + feature = features[feature_name] + values = [] + if feature.int64_list.value: + values = feature.int64_list.value + elif feature.float_list.value: + values = feature.float_list.value + elif feature.bytes_list.value: + values = feature.bytes_list.value + tf.logging.info("%s: %s" % + (feature_name, " ".join([str(x) for x in values]))) + + for writer in writers: + writer.close() + + tf.logging.info("Wrote %d total instances", total_written) + + +def get_smith_model_tokens(input_text, tokenizer, sent_token_counter): + """Generate tokens given an input text for the SMITH model.""" + res_tokens = [] + for sent in nltk.tokenize.sent_tokenize(input_text): + # The returned res_tokens is a 2D list to maintain the sentence boundary + # information. We removed all the empty tokens in this step. + if not sent: + continue + tokens = [w for w in tokenizer.tokenize(sent) if w] + sent_token_counter[0] += 1 # Track number of sentences. + sent_token_counter[1] += len(tokens) # Track number of tokens. 
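+    # Illustrative sketch (added comment, not from the upstream SMITH code).
+    # get_smith_model_tokens keeps a 2-D token list so sentence boundaries
+    # survive into the packing step. With NLTK's sentence tokenizer (the punkt
+    # model must be downloaded once) and a whitespace split standing in for
+    # the BERT wordpiece tokenizer:
+    #
+    #   import nltk
+    #   # nltk.download("punkt")
+    #   text = "SMITH encodes long documents. Sentences become blocks."
+    #   res_tokens = [s.split() for s in nltk.tokenize.sent_tokenize(text) if s]
+    #   # -> [['SMITH', 'encodes', 'long', 'documents.'],
+    #   #     ['Sentences', 'become', 'blocks.']]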
+ res_tokens.append(tokens) + return (res_tokens, sent_token_counter) + + +def create_training_instances_wiki_doc_pair( + input_file, tokenizer, max_sent_length_by_word, max_doc_length_by_sentence, + masked_lm_prob, max_predictions_per_seq, rng): + """Create `TrainingInstance`s from WikiDocPair proto data.""" + # The input data is in the WikiDocPair proto format in tfrecord. + # Add by:TC + wiki_doc_pair = wiki_doc_pair_pb2.WikiDocPair() + instances = [] + # Add some counters to track some data statistics. + sent_token_counter = [0, 0] + for example in tqdm.tqdm(tf.python_io.tf_record_iterator(input_file)): + doc_pair = wiki_doc_pair.FromString(example) + # If model_name = smith_dual_encoder, we firstly use a sentence tokenizer + # to split doc_one/doc_two texts into different sentences and use [SEN] to + # label the sentence boundary information. So in the masking and padding + # step, we know the boundary between different sentences and we can do the + # masking and padding according to the actual length of each sentence. + doc_one_text = " \n\n\n\n\n\n ".join( + [a.text for a in doc_pair.doc_one.section_contents]) + doc_two_text = " \n\n\n\n\n\n ".join( + [a.text for a in doc_pair.doc_two.section_contents]) + doc_one_text = tokenization.convert_to_unicode(doc_one_text).strip() + doc_two_text = tokenization.convert_to_unicode(doc_two_text).strip() + doc_one_tokens, sent_token_counter = get_smith_model_tokens( + doc_one_text, tokenizer, sent_token_counter) + doc_two_tokens, sent_token_counter = get_smith_model_tokens( + doc_two_text, tokenizer, sent_token_counter) + # Skip the document pairs if any document is empty. + if not doc_one_tokens or not doc_two_tokens: + continue + vocab_words = list(tokenizer.vocab.keys()) + instance_id = doc_pair.id + if doc_pair.human_label_for_classification: + doc_match_label = doc_pair.human_label_for_classification + else: + # Set the label as 0.0 if there are no available labels. 
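get_smith_model_tokens above returns a 2D token list, one inner list per natural sentence, together with running sentence/token counters. A toy sketch of that behaviour, using a whitespace tokenizer as a stand-in for the BERT FullTokenizer (the real pipeline loads a vocabulary file) and assuming the NLTK punkt model is available:

import nltk  # Requires the 'punkt' sentence model: nltk.download('punkt').

class WhitespaceTokenizer(object):
  """Stand-in for tokenization.FullTokenizer in this sketch."""

  def tokenize(self, text):
    return text.lower().split()

def toy_smith_tokens(text, tokenizer, counter):
  res_tokens = []
  for sent in nltk.tokenize.sent_tokenize(text):
    tokens = [w for w in tokenizer.tokenize(sent) if w]
    if not tokens:
      continue
    counter[0] += 1            # Number of sentences seen so far.
    counter[1] += len(tokens)  # Number of tokens seen so far.
    res_tokens.append(tokens)  # Keep the sentence boundary as a nested list.
  return res_tokens, counter

tokens, counter = toy_smith_tokens(
    "I am in Dominick's for my dinner. OK, no problem.",
    WhitespaceTokenizer(), [0, 0])
print(tokens)   # [['i', 'am', 'in', "dominick's", 'for', 'my', 'dinner.'], ['ok,', 'no', 'problem.']]
print(counter)  # [2, 10]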
+ doc_match_label = 0.0 + instances.append( + create_instance_from_wiki_doc_pair( + instance_id, doc_match_label, doc_one_tokens, doc_two_tokens, + max_sent_length_by_word, max_doc_length_by_sentence, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng)) + rng.shuffle(instances) + return (instances, sent_token_counter) + + +def create_instance_from_wiki_doc_pair(instance_id, doc_match_label, + doc_one_tokens, doc_two_tokens, + max_sent_length_by_word, + max_doc_length_by_sentence, + masked_lm_prob, max_predictions_per_seq, + vocab_words, rng): + """Creates `TrainingInstance`s for a WikiDocPair input data.""" + (tokens_1, segment_ids_1, masked_lm_positions_1, masked_lm_labels_1, \ + input_mask_1, masked_lm_weights_1) = \ + get_tokens_segment_ids_masks(max_sent_length_by_word, max_doc_length_by_sentence, doc_one_tokens, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng) + (tokens_2, segment_ids_2, masked_lm_positions_2, masked_lm_labels_2, \ + input_mask_2, masked_lm_weights_2) = \ + get_tokens_segment_ids_masks(max_sent_length_by_word, max_doc_length_by_sentence, doc_two_tokens, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng) + instance = TrainingInstance( + tokens_1=tokens_1, + segment_ids_1=segment_ids_1, + masked_lm_positions_1=masked_lm_positions_1, + masked_lm_labels_1=masked_lm_labels_1, + input_mask_1=input_mask_1, + masked_lm_weights_1=masked_lm_weights_1, + tokens_2=tokens_2, + segment_ids_2=segment_ids_2, + masked_lm_positions_2=masked_lm_positions_2, + masked_lm_labels_2=masked_lm_labels_2, + input_mask_2=input_mask_2, + masked_lm_weights_2=masked_lm_weights_2, + instance_id=instance_id, + documents_match_labels=doc_match_label) + return instance + + +def get_tokens_segment_ids_masks(max_sent_length_by_word, + max_doc_length_by_sentence, doc_one_tokens, + masked_lm_prob, max_predictions_per_seq, + vocab_words, rng): + """Get the tokens, segment ids and masks of an input sequence.""" + # The format of tokens for SMITH dual encoder models is like: + # [CLS] block1_token1 block1_token2 block1_token3 ... [SEP] [SEP] [PAD] ... + # [CLS] block2_token1 block2_token2 block2_token3 ... [SEP] [SEP] [PAD] ... + # [CLS] block3_token1 block3_token2 block3_token3 ... [SEP] [SEP] [PAD] ... + # If max_sent_length_by_word is large, then there will be many padded + # words in the sentence. Here we added an optional "greedy sentence filling" + # trick in order to reduce the number of padded words and maintain all + # content in the document. We allow a "sentence" block to contain more than + # one natural sentence and try to fill as many as sentences into the + # "sentence" block. If a sentence will be cut off and the current sentence + # block is not empty, we will put the sentence into the next "sentence" block. + # According to ALBERT paper and RoBERTa paper, a segment is usually comprised + # of more than one natural sentence, which has been shown to benefit + # performance. doc_one_tokens is a 2D list which contains the sentence + # boundary information. + sentence_num = len(doc_one_tokens) + # sent_block_token_list is a 2D list to maintain sentence block tokens. + sent_block_token_list = [] + natural_sentence_index = -1 + while natural_sentence_index + 1 < sentence_num: + natural_sentence_index += 1 + sent_tokens = doc_one_tokens[natural_sentence_index] + if not sent_tokens: + continue + if FLAGS.greedy_sentence_filling: + cur_sent_block_length = 0 + cur_sent_block = [] + # Fill as many senteces as possible in the current sentence block in a + # greedy way. 
+ while natural_sentence_index < sentence_num: + cur_natural_sent_tokens = doc_one_tokens[natural_sentence_index] + if not cur_natural_sent_tokens: + natural_sentence_index += 1 + continue + cur_sent_len = len(cur_natural_sent_tokens) + if ((cur_sent_block_length + cur_sent_len) <= + (max_sent_length_by_word - 3)) or cur_sent_block_length == 0: + # One exceptional case here is that if the 1st sentence of a sentence + # block is already going across the boundary, then the current + # sentence block will be empty. So when cur_sent_block_length is 0 + # and we meet a natural sentence with length longer than + # (max_sent_length_by_word - 3), we still put this natural sentence + # in the current sentence block. In this case, this long natural + # sentence will be cut off with the final length up to + # (max_sent_length_by_word - 3). + cur_sent_block.extend(cur_natural_sent_tokens) + cur_sent_block_length += cur_sent_len + natural_sentence_index += 1 + else: + # If cur_sent_block_length + cur_sent_len > max_sent_length_by_word-3 + # and the current sentence block is not empty, the sentence which + # goes across the boundary will be put into the next sentence block. + natural_sentence_index -= 1 + break + sent_tokens = cur_sent_block + sent_block_token_list.append(sent_tokens) + if len(sent_block_token_list) >= max_doc_length_by_sentence: + break # Skip more sentence blocks if the document is too long. + # For each sentence block, generate the token sequences, masks and paddings. + tokens_doc = [] + segment_ids_doc = [] + masked_lm_positions_doc = [] + masked_lm_labels_doc = [] + input_mask_doc = [] + masked_lm_weights_doc = [] + for block_index in range(len(sent_block_token_list)): + tokens_block, segment_ids_block, masked_lm_positions_block, \ + masked_lm_labels_block, input_mask_block, masked_lm_weights_block = \ + get_token_masks_paddings( + sent_block_token_list[block_index], + max_sent_length_by_word, + masked_lm_prob, + max_predictions_per_seq, + vocab_words, + rng, + block_index) + tokens_doc.extend(tokens_block) + segment_ids_doc.extend(segment_ids_block) + masked_lm_positions_doc.extend(masked_lm_positions_block) + masked_lm_labels_doc.extend(masked_lm_labels_block) + input_mask_doc.extend(input_mask_block) + masked_lm_weights_doc.extend(masked_lm_weights_block) + + # Pad sentence blocks if the actual number of sentence blocks is less than + # max_doc_length_by_sentence. + sentence_block_index = len(sent_block_token_list) + while sentence_block_index < max_doc_length_by_sentence: + for _ in range(max_sent_length_by_word): + tokens_doc.append("[PAD]") + segment_ids_doc.append(0) + input_mask_doc.append(0) + for _ in range(max_predictions_per_seq): + masked_lm_positions_doc.append(0) + masked_lm_labels_doc.append("[PAD]") + masked_lm_weights_doc.append(0.0) + sentence_block_index += 1 + assert len(tokens_doc) == max_sent_length_by_word * max_doc_length_by_sentence + assert len(masked_lm_labels_doc + ) == max_predictions_per_seq * max_doc_length_by_sentence + return (tokens_doc, segment_ids_doc, masked_lm_positions_doc, + masked_lm_labels_doc, input_mask_doc, masked_lm_weights_doc) + + +def get_token_masks_paddings(block_tokens, max_sent_length_by_word, + masked_lm_prob, max_predictions_per_seq, + vocab_words, rng, block_index): + """Generates tokens, masks and paddings for the input block tokens.""" + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_sent_length_by_word - 3 + # Truncates the sequence if sequence length is longer than max_num_tokens. 
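The loop above implements the greedy sentence filling trick: natural sentences are packed into fixed-size "sentence" blocks, three positions per block are reserved for [CLS], [SEP], [SEP], and a sentence that would cross the block boundary starts the next block unless the current block is still empty. The same policy in a standalone sketch (list of token lists in, list of blocks out), not the patch's own code:

def greedy_fill(sentences, max_sent_length_by_word, max_doc_length_by_sentence):
  blocks = []
  budget = max_sent_length_by_word - 3  # Reserve [CLS], [SEP], [SEP].
  i = 0
  while i < len(sentences) and len(blocks) < max_doc_length_by_sentence:
    block, block_len = [], 0
    while i < len(sentences):
      sent = sentences[i]
      if block_len + len(sent) <= budget or block_len == 0:
        # A first sentence longer than the budget still goes in; it is
        # truncated later when the special tokens are added.
        block.extend(sent)
        block_len += len(sent)
        i += 1
      else:
        break  # Sentence would cross the boundary; start a new block.
    blocks.append(block)
  return blocks

sents = [["i", "am", "in"], ["for", "my", "dinner"], ["ok", "no", "problem"]]
print(greedy_fill(sents, max_sent_length_by_word=10, max_doc_length_by_sentence=3))
# [['i', 'am', 'in', 'for', 'my', 'dinner'], ['ok', 'no', 'problem']]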
+ tokens = [] + segment_ids = [] + if len(block_tokens) > max_num_tokens: + block_tokens = block_tokens[0:max_num_tokens] + tokens_a = block_tokens + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + masked_lm_positions = [] + masked_lm_labels = [] + masked_lm_weights = [] + if max_predictions_per_seq > 0: + (tokens, masked_lm_positions, + masked_lm_labels) = utils.create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) + # Add [PAD] to tokens and masked LM related lists. + input_mask = [1] * len(tokens) + while len(tokens) < max_sent_length_by_word: + tokens.append("[PAD]") + input_mask.append(0) + segment_ids.append(0) + + assert len(tokens) == max_sent_length_by_word + assert len(input_mask) == max_sent_length_by_word + assert len(segment_ids) == max_sent_length_by_word + + if max_predictions_per_seq > 0: + # Transfer local positions in masked_lm_positions to global positions in the + # whole document to be consistent with the model training pipeline. + masked_lm_positions = [ + (i + max_sent_length_by_word * block_index) for i in masked_lm_positions + ] + masked_lm_weights = [1.0] * len(masked_lm_labels) + + while len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_labels.append("[PAD]") + masked_lm_weights.append(0.0) + return (tokens, segment_ids, masked_lm_positions, masked_lm_labels, + input_mask, masked_lm_weights) + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) + + tf.logging.info("*** Reading from input files ***") + for input_file in input_files: + tf.logging.info(" %s", input_file) + rng = random.Random(FLAGS.random_seed) + # Creates training instances. + max_predictions_per_seq = FLAGS.max_predictions_per_seq if FLAGS.add_masks_lm else 0 + masked_lm_prob = FLAGS.masked_lm_prob if FLAGS.add_masks_lm else 0 + instances, sent_token_counter = create_training_instances_wiki_doc_pair( + input_file=FLAGS.input_file, + tokenizer=tokenizer, + max_sent_length_by_word=FLAGS.max_sent_length_by_word, + max_doc_length_by_sentence=FLAGS.max_doc_length_by_sentence, + masked_lm_prob=masked_lm_prob, + max_predictions_per_seq=max_predictions_per_seq, + rng=rng) + + output_files = FLAGS.output_file.split(",") + tf.logging.info("*** Writing to output files ***") + for output_file in output_files: + tf.logging.info(" %s", output_file) + + # Transfers training instances into tensorflow examples and write the results. + write_instance_to_example_files(instances, tokenizer, output_files) + + # Finally outputs some data statistics. 
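get_token_masks_paddings above builds per-block token, segment, mask and padding lists and then shifts the masked-LM positions from block-local to document-global indices so they line up with the concatenated tokens_doc sequence. A small sketch of that index arithmetic:

# Local masked positions of one sentence block are shifted by
# block_index * max_sent_length_by_word to become document-global positions.
max_sent_length_by_word = 20
block_index = 2                 # Third sentence block in the document.
local_positions = [1, 5, 9]     # Positions inside the block.
global_positions = [p + max_sent_length_by_word * block_index
                    for p in local_positions]
print(global_positions)         # [41, 45, 49]
# The model side recovers the local position with
# local = global % max_sent_length_by_word, matching the tf.mod call in layers.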
+ tf.logging.info("sent_count, token_count, doc_pair_count: %d %d %d", + sent_token_counter[0], sent_token_counter[1], len(instances)) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("output_file") + flags.mark_flag_as_required("vocab_file") + tf.app.run() + diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test.py index f4b828c75..4a70f3736 100644 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test.py +++ b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2021 The Google Research Authors. +# Copyright 2021 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,8 +12,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# ============================================================================== -# Copyright 2021 Huawei Technologies Co., Ltd +# Copyright 2021 The Google Research Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -26,7 +27,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# ============================================================================== from npu_bridge.npu_init import * import random @@ -35,8 +35,8 @@ import tempfile from absl import flags import tensorflow.compat.v1 as tf -from smith import preprocessing_smith -from smith.bert import tokenization +from smith_npu_20220702105238 import preprocessing_smith +from smith_npu_20220702105238.bert import tokenization FLAGS = flags.FLAGS @@ -72,8 +72,11 @@ class PreprocessingSmithTest(tf.test.TestCase): self.masked_lm_prob = 0 def test_get_tokens_segment_ids_masks(self): - (tokens_1, segment_ids_1, _, _, input_mask_1, _) = \ - preprocessing_smith.get_tokens_segment_ids_masks( + + tokens_segment_ids_masks = preprocessing_smith.GetTokensSegmentIdsMasks() + + tokens_segment_ids_masks_res = \ + tokens_segment_ids_masks.get_tokens_segment_ids_masks( max_sent_length_by_word=self.max_sent_length_by_word, max_doc_length_by_sentence=self.max_doc_length_by_sentence, doc_one_tokens=self.doc_one_tokens, @@ -81,6 +84,10 @@ class PreprocessingSmithTest(tf.test.TestCase): max_predictions_per_seq=self.max_predictions_per_seq, vocab_words=self.vocab_words, rng=self.rng) + + tokens_1, segment_ids_1, _, _, input_mask_1, _ = tokens_segment_ids_masks_res.tokens_doc, tokens_segment_ids_masks_res.segment_ids_doc, tokens_segment_ids_masks_res.masked_lm_positions_doc,\ + tokens_segment_ids_masks_res.masked_lm_labels_doc, tokens_segment_ids_masks_res.input_mask_doc, tokens_segment_ids_masks_res.masked_lm_weights_doc + self.assertEqual(tokens_1, [ "[CLS]", "i", "am", "in", "[UNK]", "[UNK]", "[UNK]", "for", "my", "dinner", "[UNK]", "ok", ",", "no", "problem", "[UNK]", "[SEP]", diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test_orig.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test_orig.py new file mode 100644 index 000000000..f4b828c75 
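The test changes above switch from calling preprocessing_smith.get_tokens_segment_ids_masks directly to a GetTokensSegmentIdsMasks object whose result exposes named fields. The refactored class itself is not included in this patch; a namedtuple-style container with the field names used by the test might look roughly like the sketch below (an assumption, for illustration only):

import collections

TokensSegmentIdsMasksResult = collections.namedtuple(
    "TokensSegmentIdsMasksResult",
    ["tokens_doc", "segment_ids_doc", "masked_lm_positions_doc",
     "masked_lm_labels_doc", "input_mask_doc", "masked_lm_weights_doc"])

# The test then unpacks the fields by attribute access:
res = TokensSegmentIdsMasksResult(
    tokens_doc=["[CLS]", "i", "[SEP]"], segment_ids_doc=[0, 0, 0],
    masked_lm_positions_doc=[], masked_lm_labels_doc=[],
    input_mask_doc=[1, 1, 1], masked_lm_weights_doc=[])
tokens_1, input_mask_1 = res.tokens_doc, res.input_mask_doc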
--- /dev/null +++ b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test_orig.py @@ -0,0 +1,108 @@ +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from npu_bridge.npu_init import * +import random +import tempfile + +from absl import flags +import tensorflow.compat.v1 as tf + +from smith import preprocessing_smith +from smith.bert import tokenization + +FLAGS = flags.FLAGS + + +class PreprocessingSmithTest(tf.test.TestCase): + + def setUp(self): + super(PreprocessingSmithTest, self).setUp() + doc_one_text = ( + "I am in Dominick's for my dinner. OK, no problem. 
I am " + "in Dominick's for my dinner which is the best dinner I have " + "in my whole life.") + doc_one_text = tokenization.convert_to_unicode(doc_one_text).strip() + vocab_tokens = [ + "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "i", "am", "in", "for", + "my", "dinner", "ok", "no", "problem", "which", "is", "the", "be", + "##s", "##t", "," + ] + with tempfile.NamedTemporaryFile(delete=False) as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens + ]).encode("utf-8")) + self.vocab_file = vocab_writer.name + self.tokenizer = tokenization.FullTokenizer( + vocab_file=self.vocab_file, do_lower_case=True) + self.vocab_words = list(self.tokenizer.vocab.keys()) + self.rng = random.Random(12345) + self.doc_one_tokens, _ = preprocessing_smith.get_smith_model_tokens( + doc_one_text, self.tokenizer, [0, 0]) + self.max_sent_length_by_word = 20 + self.max_doc_length_by_sentence = 3 + self.greedy_sentence_filling = True + self.max_predictions_per_seq = 0 + self.masked_lm_prob = 0 + + def test_get_tokens_segment_ids_masks(self): + (tokens_1, segment_ids_1, _, _, input_mask_1, _) = \ + preprocessing_smith.get_tokens_segment_ids_masks( + max_sent_length_by_word=self.max_sent_length_by_word, + max_doc_length_by_sentence=self.max_doc_length_by_sentence, + doc_one_tokens=self.doc_one_tokens, + masked_lm_prob=self.masked_lm_prob, + max_predictions_per_seq=self.max_predictions_per_seq, + vocab_words=self.vocab_words, + rng=self.rng) + self.assertEqual(tokens_1, [ + "[CLS]", "i", "am", "in", "[UNK]", "[UNK]", "[UNK]", "for", "my", + "dinner", "[UNK]", "ok", ",", "no", "problem", "[UNK]", "[SEP]", + "[SEP]", "[PAD]", "[PAD]", "[CLS]", "i", "am", "in", "[UNK]", "[UNK]", + "[UNK]", "for", "my", "dinner", "which", "is", "the", "be", "##s", + "##t", "dinner", "i", "[SEP]", "[SEP]", "[PAD]", "[PAD]", "[PAD]", + "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", + "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", + "[PAD]" + ]) + self.assertEqual(segment_ids_1, [ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + ]) + self.assertEqual(input_mask_1, [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + ]) + + +if __name__ == "__main__": + tf.test.main() + -- Gitee From 6a2f73a8b449dcb90c69c7911c1897c97cf7bf17 Mon Sep 17 00:00:00 2001 From: QiuYao Date: Tue, 27 Sep 2022 20:18:33 +0800 Subject: [PATCH 2/3] add atc code --- .../bert/modeling.py | 76 ++- .../layers_orig.py | 570 ------------------ .../modeling_orig.py | 491 --------------- .../preprocessing_smith_orig.py | 547 ----------------- .../preprocessing_smith_test_orig.py | 108 ---- 5 files changed, 44 insertions(+), 1748 deletions(-) delete mode 100644 TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/layers_orig.py delete mode 100644 TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling_orig.py delete mode 100644 TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_orig.py delete mode 100644 TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test_orig.py diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/modeling.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/modeling.py index 1ca7d4124..567aca741 100644 --- 
a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/modeling.py +++ b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/modeling.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2021 The Google Research Authors. +# Copyright 2022 The Google Research Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,7 +18,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from npu_bridge.npu_init import * import collections import copy @@ -30,6 +29,8 @@ import six from six.moves import range import tensorflow.compat.v1 as tf import tf_slim as slim +from npu_bridge.npu_init import * +from npu_bridge.estimator.npu_unary_ops import npu_unary_ops class BertConfig(object): @@ -42,8 +43,8 @@ class BertConfig(object): num_attention_heads=12, intermediate_size=3072, hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, + # hidden_dropout_prob=0.1, + # attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, initializer_range=0.02): @@ -77,8 +78,8 @@ class BertConfig(object): self.num_attention_heads = num_attention_heads self.hidden_act = hidden_act self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob + # self.hidden_dropout_prob = hidden_dropout_prob + # self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range @@ -162,9 +163,9 @@ class DocBertModel(object): is invalid. """ config = copy.deepcopy(config) - if not is_training: - config.hidden_dropout_prob = 0.0 - config.attention_probs_dropout_prob = 0.0 + # if not is_training: + # config.hidden_dropout_prob = 0.0 + # config.attention_probs_dropout_prob = 0.0 input_shape = get_shape_list(input_reps, expected_rank=3) batch_size = input_shape[0] @@ -191,8 +192,7 @@ class DocBertModel(object): use_position_embeddings=True, position_embedding_name="position_embeddings", initializer_range=config.initializer_range, - max_position_embeddings=config.max_position_embeddings, - dropout_prob=config.hidden_dropout_prob) + max_position_embeddings=config.max_position_embeddings) with tf.variable_scope("doc_encoder"): # This converts a 2D mask of shape [batch_size, seq_length] to a 3D @@ -212,8 +212,8 @@ class DocBertModel(object): num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, intermediate_act_fn=get_activation(config.hidden_act), - hidden_dropout_prob=config.hidden_dropout_prob, - attention_probs_dropout_prob=config.attention_probs_dropout_prob, + # hidden_dropout_prob=config.hidden_dropout_prob, + # attention_probs_dropout_prob=config.attention_probs_dropout_prob, initializer_range=config.initializer_range, do_return_all_layers=True) @@ -327,9 +327,9 @@ class BertModel(object): is invalid. 
""" config = copy.deepcopy(config) - if not is_training: - config.hidden_dropout_prob = 0.0 - config.attention_probs_dropout_prob = 0.0 + # if not is_training: + # config.hidden_dropout_prob = 0.0 + # config.attention_probs_dropout_prob = 0.0 input_shape = get_shape_list(input_ids, expected_rank=2) batch_size = input_shape[0] @@ -365,7 +365,7 @@ class BertModel(object): position_embedding_name="position_embeddings", initializer_range=config.initializer_range, max_position_embeddings=config.max_position_embeddings, - dropout_prob=config.hidden_dropout_prob, + # dropout_prob=config.hidden_dropout_prob, trainable=sent_bert_trainable) with tf.variable_scope("encoder"): @@ -385,8 +385,8 @@ class BertModel(object): num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, intermediate_act_fn=get_activation(config.hidden_act), - hidden_dropout_prob=config.hidden_dropout_prob, - attention_probs_dropout_prob=config.attention_probs_dropout_prob, + # hidden_dropout_prob=config.hidden_dropout_prob, + # attention_probs_dropout_prob=config.attention_probs_dropout_prob, initializer_range=config.initializer_range, do_return_all_layers=True, trainable=sent_bert_trainable) @@ -541,10 +541,14 @@ def dropout(input_tensor, dropout_prob): Returns: A version of `input_tensor` with dropout applied. """ - if dropout_prob is None or dropout_prob == 0.0: - return input_tensor + # if dropout_prob is None or dropout_prob == 0.0: + # return input_tensor + + # TODO: Annotate by TC + # output = tf.nn.dropout(input_tensor, rate=dropout_prob) + # TODO: Update by:TC + output = npu_ops.dropout(input_tensor, dropout_prob) - output = tf.nn.dropout(input_tensor, rate=dropout_prob) return output @@ -554,10 +558,12 @@ def layer_norm(input_tensor, name=None): inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) -def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): +def layer_norm_and_dropout(input_tensor, name=None): """Runs layer normalization followed by dropout.""" output_tensor = layer_norm(input_tensor, name) - output_tensor = dropout(output_tensor, dropout_prob) + # TODO: 删除dropout + # output_tensor = dropout(output_tensor, dropout_prob) + return output_tensor @@ -628,7 +634,7 @@ def embedding_postprocessor(input_tensor, position_embedding_name="position_embeddings", initializer_range=0.02, max_position_embeddings=512, - dropout_prob=0.1, + # dropout_prob=0.1, trainable=True): """Performs various post-processing on a word embedding tensor. @@ -715,7 +721,7 @@ def embedding_postprocessor(input_tensor, position_broadcast_shape) output += position_embeddings - output = layer_norm_and_dropout(output, dropout_prob) + output = layer_norm_and_dropout(output) return output @@ -1052,7 +1058,9 @@ def attention_layer(from_tensor, # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. 
- attention_probs = dropout(attention_probs, attention_probs_dropout_prob) + + # TODO: 删除dropout + # attention_probs = dropout(attention_probs, attention_probs_dropout_prob) # `context_layer` = [B, F, N, H] context_layer = tf.einsum("BNFT,BTNH->BFNH", attention_probs, value_layer) @@ -1068,7 +1076,7 @@ def transformer_model(input_tensor, intermediate_size=3072, intermediate_act_fn=gelu, hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, + # attention_probs_dropout_prob=0.1, initializer_range=0.02, do_return_all_layers=False, trainable=True): @@ -1139,7 +1147,7 @@ def transformer_model(input_tensor, attention_mask=attention_mask, num_attention_heads=num_attention_heads, size_per_head=attention_head_size, - attention_probs_dropout_prob=attention_probs_dropout_prob, + # attention_probs_dropout_prob=attention_probs_dropout_prob, initializer_range=initializer_range, trainable=trainable) @@ -1155,7 +1163,10 @@ def transformer_model(input_tensor, None, "dense", trainable=trainable) - attention_output = dropout(attention_output, hidden_dropout_prob) + + # TODO: 删除dropout + # attention_output = dropout(attention_output, hidden_dropout_prob) + # Implementation of residual connections. attention_output = layer_norm( input_tensor=attention_output + layer_input) @@ -1179,7 +1190,9 @@ def transformer_model(input_tensor, None, "dense", trainable=trainable) - layer_output = dropout(layer_output, hidden_dropout_prob) + # TODO: 删除dropout + # layer_output = dropout(layer_output, hidden_dropout_prob) + layer_output = layer_norm( input_tensor=layer_output + attention_output) prev_output = layer_output @@ -1283,4 +1296,3 @@ def assert_rank(tensor, expected_rank, name=None): "For the tensor `%s` in scope `%s`, the actual rank " "`%d` (shape = %s) is not equal to the expected rank `%s`" % (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/layers_orig.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/layers_orig.py deleted file mode 100644 index 7f1254935..000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/layers_orig.py +++ /dev/null @@ -1,570 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -"""Model layers in dual encoder SMITH model.""" -from npu_bridge.npu_init import * -from six.moves import range -from npu_bridge.estimator.npu import npu_convert_dropout -import tensorflow.compat.v1 as tf - -from smith import constants -from smith.bert import modeling - - -def get_doc_rep_with_masked_sent(input_sent_reps_doc, - sent_mask_embedding, - input_mask_doc_level, - batch_size_static=32, - max_masked_sent_per_doc=2, - loop_sent_number_per_doc=32): - """Get the document representations with masked sentences. - - Args: - input_sent_reps_doc: float Tensor. The independent sentence embeddings - without masks for the sentences in the current document. The shape is - [batch, loop_sent_number_per_doc, hidden]. - sent_mask_embedding: float Tensor. The sentence embedding vector for the - masked position. The shape is [hidden]. - input_mask_doc_level: int Tensor. The input masks on the document level to - identify whether a location is a real sentence (mask = 1) or a padded - sentence (mask = 0). The shape is [batch, loop_sent_number_per_doc]. - batch_size_static: scalar. The static batch size depending on the training - or the evaluation mode. - max_masked_sent_per_doc: scalar. The maximum number of masked sentences - per document. - loop_sent_number_per_doc: scalar. The number of looped sentences per - document. - - Returns: - The document representations with masked sentences and the positions/ - weights for each masked sentences. This masked sentence weight is 1 for the - sampled real sentence position and 0 for the padded sentence position. - """ - # We at least mask two sentences to build a candidate sentence pool for - # negative sentence sampling. We generate the masked_sent_index and - # masked_sent_weight for each document. Note that we do not add any word - # or sentence level masks during prediction or inference stage. - max_masked_sent_per_doc = max(max_masked_sent_per_doc, 2) - input_sent_reps_doc_list = tf.unstack( - input_sent_reps_doc, num=batch_size_static) - real_sent_number_per_doc = tf.unstack( - tf.reduce_sum(input_mask_doc_level, 1), num=batch_size_static) - masked_sent_index_list = [] - masked_sent_weight_list = [] - - # For each example in the current batch, we randomly sample - # max_masked_sent_per_doc positions to mask the sentences. For each masked - # sentence position, the sentence in the current position is the positive - # example. The other co-masked sentences are the negative examples. - # The sampled sentence indexes will not be duplicated. - for batch_i in range(0, batch_size_static): - # Since everything in TPU must have a fixed shape, here the max sampled - # sentence index can be as large as loop_sent_number_per_doc. We will - # generate the corresponding sentence LM weights to reduce the impact - # on the final masked sentence LM loss following a similar way with the - # handling of masked word LM loss and masked word LM weights. 
- real_sent_number = real_sent_number_per_doc[batch_i] - sampled_sent_index = tf.slice( - tf.random_shuffle(tf.range(loop_sent_number_per_doc)), [0], - [max_masked_sent_per_doc]) - sampled_sent_index = tf.sort(sampled_sent_index) - masked_sent_index_list.append(sampled_sent_index) - # Generates the corresponding sampled_sent_weight - sample_sent_weight = tf.cast( - tf.less(sampled_sent_index, real_sent_number), tf.float32) - masked_sent_weight_list.append(sample_sent_weight) - - indices = tf.reshape(sampled_sent_index, [max_masked_sent_per_doc, -1]) - # Duplicates sent_mask_embedding for each masked position. - updates = tf.reshape( - tf.tile( - sent_mask_embedding, - [max_masked_sent_per_doc], - ), [max_masked_sent_per_doc, -1]) - input_sent_reps_doc_list[batch_i] = tf.tensor_scatter_update( - input_sent_reps_doc_list[batch_i], indices, updates) - # Here masked_sent_index_list is a list a tensors, where each tensor stores - # the masked sentence positions for each document in the current batch. The - # shape of masked_sent_index_list is [batch, max_masked_sent_per_doc]. - # Here masked_sent_weight_list is a list a tensors, where each tensor stores - # the masked sentence weights for each document in the current batch. The - # shape of masked_sent_weight_list is [batch, max_masked_sent_per_doc]. - return (tf.stack(input_sent_reps_doc_list), tf.stack(masked_sent_index_list), - tf.stack(masked_sent_weight_list)) - - -def get_masked_sent_lm_output(bert_config, - input_tensor, - cur_sent_reps_doc_unmask, - sent_masked_positions, - sent_masked_weights, - debugging=False): - """Get the sentence level masked LM loss. - - Args: - bert_config: BertConfig object. The configuration file for the document - level BERT model. - input_tensor: float Tensor. The contextualized representations of all - sentences learned by the document level BERT model. The shape is [batch, - loop_sent_number_per_doc, hidden]. This is the model prediction. - cur_sent_reps_doc_unmask: float Tensor. The unmasked sentence - representations of the current document. The shape is [batch, - loop_sent_number_per_doc, hidden]. This is the source of the ground - truth and negative examples in the masked sentence prediction. - sent_masked_positions: int Tensor. The masked sentence positions in the - current document. The shape is [batch, max_masked_sent_per_doc]. - sent_masked_weights: float Tensor. The masked sentence weights in the - current document. The shape is [batch, max_masked_sent_per_doc]. - debugging: bool. Whether it is in the debugging mode. - - Returns: - The masked sentence LM loss and the mask sentence LM loss per example. - - """ - # The current method for masked sentence prediction: we approach this problem - # as a multi-class classification problem similar to the masked word LM task. - # For each masked sentence position, the sentence in the current position is - # the positive example. The other co-masked sentences in the current document - # and in the other documents of the same batch are the negative examples. We - # compute the cross entropy loss over the sentence prediction task following - # the implementation of the masked word LM loss in the BERT model. - - input_tensor_shape = modeling.get_shape_list(input_tensor) - batch_size = input_tensor_shape[0] - masked_position_shape = modeling.get_shape_list(sent_masked_positions) - max_predictions_per_seq = masked_position_shape[1] - - # In the context of masked sentence prediction, the max_predictions_per_seq - # is the same with max_masked_sent_per_doc. 
- # Output Shape: [batch * max_predictions_per_seq, hidden]. - # Input_tensor is the model prediction for each position. - input_tensor = gather_indexes(input_tensor, sent_masked_positions) - # Independent_sent_embeddings is the ground truth input sentence embeddings - # for the document level BERT model. The output shape is [batch * - # max_predictions_per_seq, hidden]. - independent_sent_embeddings = gather_indexes(cur_sent_reps_doc_unmask, - sent_masked_positions) - - with tf.variable_scope("cls/sent_predictions", reuse=tf.AUTO_REUSE): - # We apply one more non-linear transformation before the output layer. - # This matrix is not used after pre-training. - with tf.variable_scope("transform"): - input_tensor = tf.layers.dense( - input_tensor, - units=bert_config.hidden_size, - activation=modeling.get_activation(bert_config.hidden_act), - kernel_initializer=modeling.create_initializer( - bert_config.initializer_range)) - # Output Shape: [batch * max_predictions_per_seq, hidden]. - input_tensor = modeling.layer_norm(input_tensor) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each predicted position. - output_bias = tf.get_variable( - "output_bias", - shape=[batch_size * max_predictions_per_seq], - initializer=tf.zeros_initializer()) - # Shape of input_tensor [batch * max_predictions_per_seq, hidden]. - # Shape of independent_sent_embeddings is [batch * max_predictions_per_seq, - # hidden]. - # Shape of logits: [batch * max_predictions_per_seq, - # batch * max_predictions_per_seq]. - logits = tf.matmul( - input_tensor, independent_sent_embeddings, transpose_b=True) - logits = tf.nn.bias_add(logits, output_bias) - # Output Shape: [batch * max_predictions_per_seq, - # batch * max_predictions_per_seq]. - log_probs = tf.nn.log_softmax(logits, axis=-1) - - # Output Shape: [batch * max_predictions_per_seq]. - # Double checked the setting of label_ids here. The label_ids - # should be the label index in the "sentence vocabulary". Thus if batch=32, - # max_predictions_per_seq = 2, then label ids should be like - # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ..., 63]. For the ground truth one hot - # label matrix, only the values in the diagonal positions are 1. All the - # other positions should be 0. - label_ids = tf.range( - 0, batch_size * max_predictions_per_seq, dtype=tf.int32) - if debugging: - label_ids = tf.Print( - label_ids, [label_ids], - message="label_ids in get_masked_sent_lm_output", - summarize=30) - # Output Shape: [batch * max_predictions_per_seq]. - # The label_weights is the flatten vector based on sent_masked_weights, - # where the weight is 1.0 for sampled real sentences and 0.0 for sampled - # masked sentences. - label_weights = tf.reshape(sent_masked_weights, [-1]) - - # Output Shape: [batch * max_predictions_per_seq, - # batch * max_predictions_per_seq]. - one_hot_labels = tf.one_hot( - label_ids, depth=batch_size * max_predictions_per_seq, dtype=tf.float32) - - # Output Shape: [batch * max_predictions_per_seq]. - per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) - # Output Shape: [1]. - numerator = tf.reduce_sum(label_weights * per_example_loss) - # Output Shape: [1]. - denominator = tf.reduce_sum(label_weights) + 1e-5 - # Output Shape: [1]. - loss = numerator / denominator - # Shape of loss [1]. - # Shape of per_example_loss is [batch * max_predictions_per_seq]. 
- return (loss, per_example_loss, log_probs) - - -def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, - label_ids, label_weights): - """Get loss and log probs for the masked LM.""" - # Output Shape: [batch * max_predictions_per_seq, hidden]. - input_tensor = gather_indexes(input_tensor, positions) - - with tf.variable_scope("cls/word_predictions", reuse=tf.AUTO_REUSE): - # We apply one more non-linear transformation before the output layer. - # This matrix is not used after pre-training. - with tf.variable_scope("transform"): - input_tensor = tf.layers.dense( - input_tensor, - units=bert_config.hidden_size, - activation=modeling.get_activation(bert_config.hidden_act), - kernel_initializer=modeling.create_initializer( - bert_config.initializer_range)) - # Output Shape: [batch * max_predictions_per_seq, hidden]. - input_tensor = modeling.layer_norm(input_tensor) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - output_bias = tf.get_variable( - "output_bias", - shape=[bert_config.vocab_size], - initializer=tf.zeros_initializer()) - # Shape of input_tensor [batch * max_predictions_per_seq, embedding_size]. - # Shape of output_weights (embed table) is [vocab_size, embedding_size]. - # In the current Bert implementation: embedding_size = hidden. - logits = tf.matmul(input_tensor, output_weights, transpose_b=True) - logits = tf.nn.bias_add(logits, output_bias) - # Output Shape: [batch * max_predictions_per_seq, vocab_size]. - log_probs = tf.nn.log_softmax(logits, axis=-1) - - # Output Shape: [batch * max_predictions_per_seq]. - label_ids = tf.reshape(label_ids, [-1]) - # Output Shape: [batch * max_predictions_per_seq]. - label_weights = tf.reshape(label_weights, [-1]) - - # Output Shape: [batch * max_predictions_per_seq, vocab_size]. - one_hot_labels = tf.one_hot( - label_ids, depth=bert_config.vocab_size, dtype=tf.float32) - - # The `positions` tensor might be zero-padded (if the sequence is too - # short to have the maximum number of predictions). The `label_weights` - # tensor has a value of 1.0 for every real prediction and 0.0 for the - # padding predictions. - # Output Shape: [batch * max_predictions_per_seq]. - per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) - # Output Shape: [1]. - numerator = tf.reduce_sum(label_weights * per_example_loss) - # Output Shape: [1]. - denominator = tf.reduce_sum(label_weights) + 1e-5 - # Output Shape: [1]. - loss = numerator / denominator - # Shape of loss [1]. - # Shape of per_example_loss is [batch * max_predictions_per_seq]. - return (loss, per_example_loss, log_probs) - - -def gather_indexes(sequence_tensor, positions): - """Gathers the vectors at the specific positions over a minibatch.""" - # Shape of positions: [batch, max_mask_per_seq]. - sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) - batch_size = sequence_shape[0] - seq_length = sequence_shape[1] - width = sequence_shape[2] - - # Shape of flat_offsets: [batch, 1]. - flat_offsets = tf.reshape( - tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) - flat_positions = tf.reshape(positions + flat_offsets, [-1]) - flat_sequence_tensor = tf.reshape(sequence_tensor, - [batch_size * seq_length, width]) - output_tensor = tf.gather(flat_sequence_tensor, flat_positions) - # The shape of output_tensor [batch * max_mask_per_seq, hidden]. 
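gather_indexes above flattens the [batch, seq_length, hidden] tensor and adds a per-example offset so a single tf.gather can pick out the masked positions for the whole batch. The same offset arithmetic in a plain-Python sketch:

batch_size, seq_length = 2, 5
sequence = [["b%d_t%d" % (b, t) for t in range(seq_length)]
            for b in range(batch_size)]
positions = [[1, 3], [0, 4]]  # Per-example positions to gather.

# Flatten the batch and shift each example's positions by b * seq_length.
flat_sequence = [tok for row in sequence for tok in row]
flat_positions = [b * seq_length + p
                  for b, row in enumerate(positions) for p in row]
gathered = [flat_sequence[i] for i in flat_positions]
print(gathered)  # ['b0_t1', 'b0_t3', 'b1_t0', 'b1_t4']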
- return output_tensor - - -def get_attention_weighted_sum(input_tensor, bert_config, is_training, - attention_size): - """Compute the attentive weighted sum of an input tensor. - - Args: - input_tensor: The input tensor for attentive representation. The shape of - input tensor is [batch, seq_length, hidden]. - bert_config: The model config file. - is_training: If true, it is in training mode. - attention_size: int. Dimension of contextual vector. - - Returns: - The attentive representation of the input tensor. The shape of the output - tensor is [batch, hidden]. - """ - with tf.variable_scope("combine_reps_attention", reuse=tf.AUTO_REUSE): - context_vector = tf.get_variable( - name="context_vector", - shape=[attention_size], - dtype=tf.float32) - # Output Shape: [batch, seq_length, attention_size]. - projection = tf.layers.dense( - input_tensor, - attention_size, - activation=tf.tanh, - kernel_initializer=modeling.create_initializer( - bert_config.initializer_range)) - # Output Shape: [batch, seq_length, 1]. - attention = tf.reduce_sum( - tf.multiply(projection, context_vector), axis=2, keep_dims=True) - # Output Shape: [batch, seq_length, 1]. - attention = tf.nn.softmax(attention, axis=1) - # Output Shape: [batch, hidden]. - last_outputs = tf.reduce_sum(tf.multiply(input_tensor, attention), axis=1) - if is_training: - last_outputs = tf.layers.dropout( - last_outputs, bert_config.attention_probs_dropout_prob, training=True) - return last_outputs - - -def get_seq_rep_from_bert(bert_model): - """Get the sequence represenation given a BERT encoder.""" - siamese_input_tensor = bert_model.get_pooled_output() - hidden_size = siamese_input_tensor.shape[-1].value - siamese_input_tensor = tf.layers.dense( - siamese_input_tensor, units=hidden_size, activation=tf.nn.relu) - normalized_siamese_input_tensor = tf.nn.l2_normalize( - siamese_input_tensor, axis=1) - return normalized_siamese_input_tensor - - -def get_sent_reps_masks_normal_loop(sent_index, - input_sent_reps_doc, - input_mask_doc_level, - masked_lm_loss_doc, - masked_lm_example_loss_doc, - masked_lm_weights_doc, - dual_encoder_config, - is_training, - train_mode, - input_ids, - input_mask, - masked_lm_positions, - masked_lm_ids, - masked_lm_weights, - use_one_hot_embeddings, - debugging=False): - """Get the sentence encodings, mask ids and masked word LM loss. - - Args: - sent_index: The index of the current looped sentence. - input_sent_reps_doc: The representations of all sentences in the doc - learned by BERT. - input_mask_doc_level: The document level input masks, which indicates - whether a sentence is a real sentence or a padded sentence. - masked_lm_loss_doc: The sum of all the masked word LM loss. - masked_lm_example_loss_doc: The per example masked word LM loss. - masked_lm_weights_doc: the weights of the maksed LM words. If the position - is corresponding to a real masked word, it is 1.0; It is a padded mask, - the weight is 0. - dual_encoder_config: The config of the dual encoder. - is_training: Whether it is in the training mode. - train_mode: string. The train mode which can be finetune, joint_train, or - pretrain. - input_ids: The ids of the input tokens. - input_mask: The mask of the input tokens. - masked_lm_positions: The positions of the masked words in the language - model training. - masked_lm_ids: The ids of the masked words in LM model training. - masked_lm_weights: The weights of the masked words in LM model training. - use_one_hot_embeddings: Whether use one hot embedding. It should be true - for the runs on TPUs. 
- debugging: bool. Whether it is in the debugging mode. - - Returns: - A list of tensors on the learned sentence representations and the masked - word LM loss. - """ - # Collect token information for the current sentence. - bert_config = modeling.BertConfig.from_json_file( - dual_encoder_config.encoder_config.bert_config_file) - max_sent_length_by_word = dual_encoder_config.encoder_config.max_sent_length_by_word - sent_bert_trainable = dual_encoder_config.encoder_config.sent_bert_trainable - max_predictions_per_seq = dual_encoder_config.encoder_config.max_predictions_per_seq - sent_start = sent_index * max_sent_length_by_word - input_ids_cur_sent = tf.slice(input_ids, [0, sent_start], - [-1, max_sent_length_by_word]) - # Output shape: [batch, max_sent_length_by_word]. - input_mask_cur_sent = tf.slice(input_mask, [0, sent_start], - [-1, max_sent_length_by_word]) - # Output Shape: [batch]. - input_mask_cur_sent_max = tf.reduce_max(input_mask_cur_sent, 1) - # Output Shape: [loop_sent_number_per_doc, batch]. - input_mask_doc_level.append(input_mask_cur_sent_max) - if debugging: - input_ids_cur_sent = tf.Print( - input_ids_cur_sent, [input_ids_cur_sent, input_mask_cur_sent], - message="input_ids_cur_sent in get_sent_reps_masks_lm_loss", - summarize=20) - model = modeling.BertModel( - config=bert_config, - is_training=is_training, - input_ids=input_ids_cur_sent, - input_mask=input_mask_cur_sent, - use_one_hot_embeddings=use_one_hot_embeddings, - sent_bert_trainable=sent_bert_trainable) - with tf.variable_scope("seq_rep_from_bert_sent_dense", reuse=tf.AUTO_REUSE): - normalized_siamese_input_tensor = get_seq_rep_from_bert(model) - input_sent_reps_doc.append(normalized_siamese_input_tensor) - - if (train_mode == constants.TRAIN_MODE_PRETRAIN or - train_mode == constants.TRAIN_MODE_JOINT_TRAIN): - # Collect masked token information for the current sentence. - sent_mask_lm_token_start = sent_index * max_predictions_per_seq - # Output shape: [batch, max_predictions_per_seq]. - masked_lm_positions_cur_sent = tf.slice(masked_lm_positions, - [0, sent_mask_lm_token_start], - [-1, max_predictions_per_seq]) - masked_lm_ids_cur_sent = tf.slice(masked_lm_ids, - [0, sent_mask_lm_token_start], - [-1, max_predictions_per_seq]) - masked_lm_weights_cur_sent = tf.slice(masked_lm_weights, - [0, sent_mask_lm_token_start], - [-1, max_predictions_per_seq]) - # Since in the processed data of smith model, the masked lm positions are - # global indices started from the 1st token of the whole sequence, we need - # to transform this global position to a local position for the current - # sentence. The position index is started from 0. - # Local_index = global_index mod max_sent_length_by_word. - masked_lm_positions_cur_sent = tf.mod(masked_lm_positions_cur_sent, - max_sent_length_by_word) - # Shape of masked_lm_loss_cur_sent [1]. - # Shape of masked_lm_example_loss_cur_sent is [batch, - # max_predictions_per_seq]. - (masked_lm_loss_cur_sent, masked_lm_example_loss_cur_sent, - _) = get_masked_lm_output(bert_config, model.get_sequence_output(), - model.get_embedding_table(), - masked_lm_positions_cur_sent, - masked_lm_ids_cur_sent, - masked_lm_weights_cur_sent) - # Output Shape: [1]. - masked_lm_loss_doc += masked_lm_loss_cur_sent - # Output Shape: [loop_sent_number_per_doc, batch * max_predictions_per_seq]. - masked_lm_example_loss_doc.append(masked_lm_example_loss_cur_sent) - # Output Shape: [loop_sent_number_per_doc, batch, max_predictions_per_seq]. 
- masked_lm_weights_doc.append(masked_lm_weights_cur_sent) - return (input_sent_reps_doc, input_mask_doc_level, masked_lm_loss_doc, - masked_lm_example_loss_doc, masked_lm_weights_doc) - - -def learn_sent_reps_normal_loop(dual_encoder_config, is_training, train_mode, - input_ids_1, input_mask_1, - masked_lm_positions_1, masked_lm_ids_1, - masked_lm_weights_1, input_ids_2, input_mask_2, - masked_lm_positions_2, masked_lm_ids_2, - masked_lm_weights_2, use_one_hot_embeddings): - """Learn the sentence representations with normal loop functions.""" - input_sent_reps_doc_1 = [] - # Generate document level input masks on each sentence based on the word - # level input mask information. - input_mask_doc_level_1 = [] - masked_lm_loss_doc_1 = 0.0 - masked_lm_example_loss_doc_1 = [] - masked_lm_weights_doc_1 = [] - - input_mask_doc_level_2 = [] - input_sent_reps_doc_2 = [] - masked_lm_loss_doc_2 = 0.0 - masked_lm_example_loss_doc_2 = [] - masked_lm_weights_doc_2 = [] - - # Learn the representation for each sentence in the document. - # Setting smaller number of loop_sent_number_per_doc can save memory for the - # model training. - # Shape of masked_lm_loss_doc_1 [1]. - # Shape of masked_lm_example_loss_doc_1 is [max_doc_length_by_sentence, - # batch * max_predictions_per_seq]. - for sent_index in range( - 0, dual_encoder_config.encoder_config.loop_sent_number_per_doc): - (input_sent_reps_doc_1, input_mask_doc_level_1, masked_lm_loss_doc_1, - masked_lm_example_loss_doc_1, - masked_lm_weights_doc_1) = get_sent_reps_masks_normal_loop( - sent_index, input_sent_reps_doc_1, input_mask_doc_level_1, - masked_lm_loss_doc_1, masked_lm_example_loss_doc_1, - masked_lm_weights_doc_1, dual_encoder_config, is_training, train_mode, - input_ids_1, input_mask_1, masked_lm_positions_1, masked_lm_ids_1, - masked_lm_weights_1, use_one_hot_embeddings) - (input_sent_reps_doc_2, input_mask_doc_level_2, masked_lm_loss_doc_2, - masked_lm_example_loss_doc_2, - masked_lm_weights_doc_2) = get_sent_reps_masks_normal_loop( - sent_index, input_sent_reps_doc_2, input_mask_doc_level_2, - masked_lm_loss_doc_2, masked_lm_example_loss_doc_2, - masked_lm_weights_doc_2, dual_encoder_config, is_training, train_mode, - input_ids_2, input_mask_2, masked_lm_positions_2, masked_lm_ids_2, - masked_lm_weights_2, use_one_hot_embeddings) - - # Stack the sentence representations to learn the doc representations. - # Output Shape: [batch, loop_sent_number_per_doc, hidden]. - input_sent_reps_doc_1_unmask = tf.stack(input_sent_reps_doc_1, axis=1) - input_sent_reps_doc_2_unmask = tf.stack(input_sent_reps_doc_2, axis=1) - - # Output Shape: [batch, loop_sent_number_per_doc]. - input_mask_doc_level_1_tensor = tf.stack(input_mask_doc_level_1, axis=1) - input_mask_doc_level_2_tensor = tf.stack(input_mask_doc_level_2, axis=1) - - if (train_mode == constants.TRAIN_MODE_PRETRAIN or - train_mode == constants.TRAIN_MODE_JOINT_TRAIN): - # Output Shape: [batch * max_predictions_per_seq, - # loop_sent_number_per_doc]. - masked_lm_example_loss_doc_1 = tf.stack( - masked_lm_example_loss_doc_1, axis=1) - masked_lm_example_loss_doc_2 = tf.stack( - masked_lm_example_loss_doc_2, axis=1) - - # Output Shape: [batch, loop_sent_number_per_doc, max_predictions_per_seq]. 
- masked_lm_weights_doc_1 = tf.stack(masked_lm_weights_doc_1, axis=1) - masked_lm_weights_doc_2 = tf.stack(masked_lm_weights_doc_2, axis=1) - else: - masked_lm_example_loss_doc_1 = tf.zeros([1]) - masked_lm_example_loss_doc_2 = tf.zeros([1]) - masked_lm_weights_doc_1 = tf.zeros([1]) - masked_lm_weights_doc_2 = tf.zeros([1]) - - return (input_sent_reps_doc_1_unmask, input_mask_doc_level_1_tensor, - input_sent_reps_doc_2_unmask, input_mask_doc_level_2_tensor, - masked_lm_loss_doc_1, masked_lm_loss_doc_2, - masked_lm_example_loss_doc_1, masked_lm_example_loss_doc_2, - masked_lm_weights_doc_1, masked_lm_weights_doc_2) - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling_orig.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling_orig.py deleted file mode 100644 index a2373128f..000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling_orig.py +++ /dev/null @@ -1,491 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Dual encoder SMITH models.""" -from npu_bridge.npu_init import * - -import tensorflow.compat.v1 as tf - -from smith import constants -from smith import layers -from smith import loss_fns -from smith import metric_fns -from smith import utils -from smith.bert import modeling -from smith.bert import optimization - -# Add by:TC -import precision_tool.tf_config as npu_tf_config - -def build_smith_dual_encoder(dual_encoder_config, - train_mode, - is_training, - input_ids_1, - input_mask_1, - masked_lm_positions_1, - masked_lm_ids_1, - masked_lm_weights_1, - input_ids_2, - input_mask_2, - masked_lm_positions_2, - masked_lm_ids_2, - masked_lm_weights_2, - use_one_hot_embeddings, - documents_match_labels, - debugging=False): - """Build the dual encoder SMITH model. - - Args: - dual_encoder_config: the configuration file for the dual encoder model. - train_mode: string. The train mode of the current. It can be finetune, - pretrain or joint_train. - is_training: bool. Whether it in training mode. - input_ids_1: int Tensor with shape [batch, max_seq_length]. The input ids of - input examples of text 1. - input_mask_1: int Tensor with shape [batch, max_seq_length]. The input masks - of input examples of text 1. - masked_lm_positions_1: int Tensor with shape [batch, - max_predictions_per_seq]. 
The input masked LM prediction positions of - input examples of text 1. This can be useful to compute the masked word - prediction LM loss. - masked_lm_ids_1: int Tensor with shape [batch, max_predictions_per_seq]. The - input masked LM prediction ids of input examples of text 1. It is the - ground truth in the masked word LM prediction task. This can be useful to - compute the masked word prediction LM loss. - masked_lm_weights_1: float Tensor with shape [batch, - max_predictions_per_seq]. The input masked LM prediction weights of input - examples of text 1. - input_ids_2: int Tensor with shape [batch, max_seq_length]. The input ids of - input examples of text 2. - input_mask_2: int Tensor with shape [batch, max_seq_length]. The input masks - of input examples of text 2. - masked_lm_positions_2: int Tensor with shape [batch, - max_predictions_per_seq]. The input masked LM prediction positions of - input examples of text 2. This can be useful to compute the masked word - prediction LM loss. - masked_lm_ids_2: int Tensor with shape [batch, max_predictions_per_seq]. The - input masked LM prediction ids of input examples of text 2. It is the - ground truth in the masked word LM prediction task. This can be useful to - compute the masked word prediction LM loss. - masked_lm_weights_2: float Tensor with shape [batch, - max_predictions_per_seq]. The input masked LM prediction weights of input - examples of text 2. - use_one_hot_embeddings: bool. Whether use one hot embeddings. - documents_match_labels: float Tensor with shape [batch]. The ground truth - labels for the input examples. - debugging: bool. Whether it is in the debugging mode. - - Returns: - The masked LM loss, per example LM loss, masked sentence LM loss, per - example masked sentence LM loss, sequence representations, text matching - loss, per example text matching loss, text matching logits, text matching - probabilities and text matching log probabilities. - - Raises: - ValueError: if the doc_rep_combine_mode in dual_encoder_config is invalid. - """ - bert_config = modeling.BertConfig.from_json_file( - dual_encoder_config.encoder_config.bert_config_file) - doc_bert_config = modeling.BertConfig.from_json_file( - dual_encoder_config.encoder_config.doc_bert_config_file) - (input_sent_reps_doc_1_unmask, input_mask_doc_level_1_tensor, - input_sent_reps_doc_2_unmask, input_mask_doc_level_2_tensor, - masked_lm_loss_doc_1, masked_lm_loss_doc_2, masked_lm_example_loss_doc_1, - masked_lm_example_loss_doc_2, masked_lm_weights_doc_1, - masked_lm_weights_doc_2) = layers.learn_sent_reps_normal_loop( - dual_encoder_config, is_training, train_mode, input_ids_1, input_mask_1, - masked_lm_positions_1, masked_lm_ids_1, masked_lm_weights_1, input_ids_2, - input_mask_2, masked_lm_positions_2, masked_lm_ids_2, - masked_lm_weights_2, use_one_hot_embeddings) - if debugging: - input_mask_doc_level_1_tensor = tf.Print( - input_mask_doc_level_1_tensor, - [input_mask_doc_level_1_tensor, input_mask_doc_level_2_tensor], - message="input_mask_doc_level_1_tensor in build_smith_dual_encoder", - summarize=30) - - if dual_encoder_config.encoder_config.use_masked_sentence_lm_loss: - batch_size_static = ( - dual_encoder_config.train_eval_config.train_batch_size if is_training - else dual_encoder_config.train_eval_config.eval_batch_size) - # Generates the sentence masked document represenations. - with tf.variable_scope("mask_sent_in_doc", reuse=tf.AUTO_REUSE): - # Randomly initialize a masked sentence vector and reuse it. 
- # We also need to return the masked sentence position index to get the - # ground truth labels for the masked positions. The shape of - # sent_mask_embedding is [hidden]. - sent_mask_embedding = tf.get_variable( - name="sentence_mask_embedding", - shape=[bert_config.hidden_size], - initializer=tf.truncated_normal_initializer( - stddev=bert_config.initializer_range)) - # Output Shape: [batch, loop_sent_number_per_doc, hidden]. - (input_sent_reps_doc_1_masked, masked_sent_index_1, - masked_sent_weight_1) = layers.get_doc_rep_with_masked_sent( - input_sent_reps_doc=input_sent_reps_doc_1_unmask, - sent_mask_embedding=sent_mask_embedding, - input_mask_doc_level=input_mask_doc_level_1_tensor, - batch_size_static=batch_size_static, - max_masked_sent_per_doc=dual_encoder_config.encoder_config - .max_masked_sent_per_doc, - loop_sent_number_per_doc=dual_encoder_config.encoder_config - .loop_sent_number_per_doc) - (input_sent_reps_doc_2_masked, masked_sent_index_2, - masked_sent_weight_2) = layers.get_doc_rep_with_masked_sent( - input_sent_reps_doc=input_sent_reps_doc_2_unmask, - sent_mask_embedding=sent_mask_embedding, - input_mask_doc_level=input_mask_doc_level_2_tensor, - batch_size_static=batch_size_static, - max_masked_sent_per_doc=dual_encoder_config.encoder_config - .max_masked_sent_per_doc, - loop_sent_number_per_doc=dual_encoder_config.encoder_config - .loop_sent_number_per_doc) - # Learn the document representations based on masked sentence embeddings. - # Note that the variables in the DocBert model are not within the - # "mask_sent_in_doc" variable scope. - model_doc_1 = modeling.DocBertModel( - config=doc_bert_config, - is_training=is_training, - input_reps=input_sent_reps_doc_1_masked, - input_mask=input_mask_doc_level_1_tensor) - model_doc_2 = modeling.DocBertModel( - config=doc_bert_config, - is_training=is_training, - input_reps=input_sent_reps_doc_2_masked, - input_mask=input_mask_doc_level_2_tensor) - # Shape of masked_sent_lm_loss_1 [1]. - # Shape of masked_sent_lm_example_loss_1 is [batch * - # max_predictions_per_seq]. - (masked_sent_lm_loss_1, masked_sent_per_example_loss_1, - _) = layers.get_masked_sent_lm_output(doc_bert_config, - model_doc_1.get_sequence_output(), - input_sent_reps_doc_1_unmask, - masked_sent_index_1, - masked_sent_weight_1) - (masked_sent_lm_loss_2, masked_sent_per_example_loss_2, - _) = layers.get_masked_sent_lm_output(doc_bert_config, - model_doc_2.get_sequence_output(), - input_sent_reps_doc_2_unmask, - masked_sent_index_2, - masked_sent_weight_2) - else: - # Learn the document representations based on unmasked sentence embeddings. - model_doc_1 = modeling.DocBertModel( - config=doc_bert_config, - is_training=is_training, - input_reps=input_sent_reps_doc_1_unmask, - input_mask=input_mask_doc_level_1_tensor) - model_doc_2 = modeling.DocBertModel( - config=doc_bert_config, - is_training=is_training, - input_reps=input_sent_reps_doc_2_unmask, - input_mask=input_mask_doc_level_2_tensor) - masked_sent_lm_loss_1 = 0 - masked_sent_lm_loss_2 = 0 - masked_sent_per_example_loss_1 = tf.zeros(1) - masked_sent_per_example_loss_2 = tf.zeros(1) - masked_sent_weight_1 = tf.zeros(1) - masked_sent_weight_2 = tf.zeros(1) - - with tf.variable_scope("seq_rep_from_bert_doc_dense", reuse=tf.AUTO_REUSE): - normalized_doc_rep_1 = layers.get_seq_rep_from_bert(model_doc_1) - normalized_doc_rep_2 = layers.get_seq_rep_from_bert(model_doc_2) - - # We also dump the contextualized sentence embedding output by document - # level Transformer model. 
These representations maybe useful for sentence - # level tasks. - output_sent_reps_doc_1 = model_doc_1.get_sequence_output() - output_sent_reps_doc_2 = model_doc_2.get_sequence_output() - - # Here we support multiple modes to generate the final document - # representations based on the word/sentence/document level representations - # 1. normal: only use the document level representation as the final document - # representations. - # 2. sum_concat: firstly compute the sum of all sentence level repsentations. - # Then concatenate the sum vector with the document level representations. - # 3. mean_concat: firstly compute the mean of all sentence level - # repsentations. Then concatenate the mean vector with the document level - # representations. - # 4. attention: firstly compute the weighted sum of sentence level - # representations with attention mechanism, then concatenate the weighted sum - # vector with the document level representations. - # The document level mask is to indicate whether each sentence is - # a real sentence (1) or a paded sentence (0). The shape of - # input_mask_doc_level_1_tensor is [batch, max_doc_length_by_sentence]. The - # shape of input_sent_reps_doc_1_unmask is - # [batch, max_doc_length_by_sentence, hidden]. - final_doc_rep_combine_mode = dual_encoder_config.encoder_config.doc_rep_combine_mode - if final_doc_rep_combine_mode == constants.DOC_COMBINE_NORMAL: - final_doc_rep_1 = normalized_doc_rep_1 - final_doc_rep_2 = normalized_doc_rep_2 - elif final_doc_rep_combine_mode == constants.DOC_COMBINE_SUM_CONCAT: - # Output Shape: [batch, 2*hidden]. - final_doc_rep_1 = tf.concat( - [tf.reduce_sum(input_sent_reps_doc_1_unmask, 1), normalized_doc_rep_1], - axis=1) - final_doc_rep_2 = tf.concat( - [tf.reduce_sum(input_sent_reps_doc_2_unmask, 1), normalized_doc_rep_2], - axis=1) - elif final_doc_rep_combine_mode == constants.DOC_COMBINE_MEAN_CONCAT: - final_doc_rep_1 = tf.concat( - [tf.reduce_mean(input_sent_reps_doc_1_unmask, 1), normalized_doc_rep_1], - axis=1) - final_doc_rep_2 = tf.concat( - [tf.reduce_mean(input_sent_reps_doc_2_unmask, 1), normalized_doc_rep_2], - axis=1) - elif final_doc_rep_combine_mode == constants.DOC_COMBINE_ATTENTION: - final_doc_rep_1 = tf.concat([ - layers.get_attention_weighted_sum( - input_sent_reps_doc_1_unmask, bert_config, is_training, - dual_encoder_config.encoder_config.doc_rep_combine_attention_size), - normalized_doc_rep_1 - ], - axis=1) - final_doc_rep_2 = tf.concat([ - layers.get_attention_weighted_sum( - input_sent_reps_doc_2_unmask, bert_config, is_training, - dual_encoder_config.encoder_config.doc_rep_combine_attention_size), - normalized_doc_rep_2 - ], - axis=1) - else: - raise ValueError("Only normal, sum_concat, mean_concat and attention are" - " supported: %s" % final_doc_rep_combine_mode) - (siamese_loss, siamese_example_loss, - siamese_logits) = loss_fns.get_prediction_loss_cosine( - input_tensor_1=final_doc_rep_1, - input_tensor_2=final_doc_rep_2, - labels=documents_match_labels, - similarity_score_amplifier=dual_encoder_config.loss_config - .similarity_score_amplifier, - neg_to_pos_example_ratio=dual_encoder_config.train_eval_config - .neg_to_pos_example_ratio) - - # The shape of masked_lm_loss_doc is [1]. - # The shape of masked_lm_example_loss_doc is [batch * max_predictions_per_seq, - # max_doc_length_by_sentence]. 
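# A standalone NumPy sketch of the non-attention doc_rep_combine_mode options
# described above. The sizes are illustrative (not taken from a real config),
# and the arrays merely stand in for input_sent_reps_doc_*_unmask and
# normalized_doc_rep_*; the attention mode is omitted because it depends on
# learned weights.
import numpy as np

batch, num_sent, hidden = 32, 48, 256
input_sent_reps_doc_unmask = np.random.rand(batch, num_sent, hidden)
normalized_doc_rep = np.random.rand(batch, hidden)

normal = normalized_doc_rep                                   # [batch, hidden]
sum_concat = np.concatenate(
    [input_sent_reps_doc_unmask.sum(axis=1), normalized_doc_rep], axis=1)
mean_concat = np.concatenate(
    [input_sent_reps_doc_unmask.mean(axis=1), normalized_doc_rep], axis=1)
# Both concat modes double the final document representation width.
assert sum_concat.shape == mean_concat.shape == (batch, 2 * hidden)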
- return (masked_lm_loss_doc_1, masked_lm_loss_doc_2, - masked_lm_example_loss_doc_1, masked_lm_example_loss_doc_2, - masked_lm_weights_doc_1, masked_lm_weights_doc_2, - masked_sent_lm_loss_1, masked_sent_lm_loss_2, - masked_sent_per_example_loss_1, masked_sent_per_example_loss_2, - masked_sent_weight_1, masked_sent_weight_2, final_doc_rep_1, - final_doc_rep_2, input_sent_reps_doc_1_unmask, - input_sent_reps_doc_2_unmask, output_sent_reps_doc_1, - output_sent_reps_doc_2, siamese_loss, siamese_example_loss, - siamese_logits) - - -def model_fn_builder(dual_encoder_config, - train_mode, - learning_rate, - num_train_steps, - num_warmup_steps, - use_tpu, - use_one_hot_embeddings, - debugging=False): - """Returns `model_fn` closure for TPUEstimator.""" - - def model_fn(features, labels, mode, params): # pylint: disable=unused-argument - """The `model_fn` for TPUEstimator.""" - tf.logging.info("*** Current mode: %s ***" % mode) - tf.logging.info("*** Features ***") - for name in sorted(features.keys()): - tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) - - input_ids_1 = features["input_ids_1"] - input_mask_1 = features["input_mask_1"] - if train_mode == constants.TRAIN_MODE_FINETUNE: - masked_lm_positions_1 = tf.zeros([1]) - masked_lm_ids_1 = tf.zeros([1]) - masked_lm_weights_1 = tf.zeros([1]) - else: - masked_lm_positions_1 = features["masked_lm_positions_1"] - masked_lm_ids_1 = features["masked_lm_ids_1"] - masked_lm_weights_1 = features["masked_lm_weights_1"] - - input_ids_2 = features["input_ids_2"] - input_mask_2 = features["input_mask_2"] - if train_mode == constants.TRAIN_MODE_FINETUNE: - masked_lm_positions_2 = tf.zeros([1]) - masked_lm_ids_2 = tf.zeros([1]) - masked_lm_weights_2 = tf.zeros([1]) - else: - masked_lm_positions_2 = features["masked_lm_positions_2"] - masked_lm_ids_2 = features["masked_lm_ids_2"] - masked_lm_weights_2 = features["masked_lm_weights_2"] - documents_match_labels = features["documents_match_labels"] - # Since the document_match_labels might contain labels like 0/1/2, we need - # to transfer these labels to binary labels like 0/1. - documents_match_labels = tf.cast(documents_match_labels > 0, tf.float32) - is_real_example = None - if "is_real_example" in features: - is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) - else: - is_real_example = tf.ones( - tf.shape(documents_match_labels), dtype=tf.float32) - - is_training = (mode == tf.estimator.ModeKeys.TRAIN) - - if (dual_encoder_config.encoder_config.model_name == - constants.MODEL_NAME_SMITH_DUAL_ENCODER): - # For the smith model, since the actual looped number of sentences per - # document maybe smaller than max_doc_length_by_sentence, we need to - # overwrite the lm weights with the actual lm weights returned by the - # function. 
- (masked_lm_loss_1, masked_lm_loss_2, masked_lm_example_loss_1, - masked_lm_example_loss_2, masked_lm_weights_1, masked_lm_weights_2, - masked_sent_lm_loss_1, masked_sent_lm_loss_2, - masked_sent_per_example_loss_1, masked_sent_per_example_loss_2, - masked_sent_weight_1, masked_sent_weight_2, seq_embed_1, seq_embed_2, - input_sent_embed_1, input_sent_embed_2, output_sent_embed_1, - output_sent_embed_2, siamese_loss, - siamese_example_loss, siamese_logits) = build_smith_dual_encoder( - dual_encoder_config, train_mode, is_training, input_ids_1, - input_mask_1, masked_lm_positions_1, masked_lm_ids_1, - masked_lm_weights_1, input_ids_2, input_mask_2, - masked_lm_positions_2, masked_lm_ids_2, masked_lm_weights_2, - use_one_hot_embeddings, documents_match_labels, debugging) - else: - raise ValueError( - "Only smith_dual_encoder is supported: %s" % - dual_encoder_config.encoder_config.model_name) - - # There are three different modes for training in the smith model. - # 1. joint_train: a multi-task learning setting which combines the masked - # word LM losses for doc1/doc2 and the siamese matching loss. If we add the - # masked sentence LM task, we also add the masked sentence LM losses for - # the two documents. - # 2. pretrain: only contains the masked word LM losses for doc1/doc2. We - # currently didn't include the NSP loss since NSP loss is not very useful - # according to the XLNet/ RoBERTa/ ALBERT paper. If we add the masked - # sentence LM task, we also add the masked sentence LM losses for the - # two documents. - # 3. finetune: fine tune the model with loaded pretrained checkpoint only - # with the siamese matching loss. If we add the masked sentence LM task, - # we also add the masked sentence LM losses for the two documents. - if train_mode == constants.TRAIN_MODE_JOINT_TRAIN: - total_loss = masked_lm_loss_1 + masked_lm_loss_2 + siamese_loss - elif train_mode == constants.TRAIN_MODE_PRETRAIN: - total_loss = masked_lm_loss_1 + masked_lm_loss_2 - elif train_mode == constants.TRAIN_MODE_FINETUNE: - total_loss = siamese_loss - else: - raise ValueError("Only joint_train, pretrain, finetune are supported.") - # If we add the masked sentence LM task, we also add the masked sentence - # LM losses for the two documents. - if dual_encoder_config.encoder_config.use_masked_sentence_lm_loss: - total_loss += (masked_sent_lm_loss_1 + masked_sent_lm_loss_2) - - total_loss = tf.identity(total_loss, name='total_loss') - - tvars = tf.trainable_variables() - initialized_variable_names = {} - scaffold_fn = None - init_checkpoint = dual_encoder_config.encoder_config.init_checkpoint - # Load pretrained BERT checkpoints if there is a specified path. 
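# A plain-Python sketch of the train-mode loss combination described above
# (joint_train / pretrain / finetune, plus the optional masked sentence LM
# terms). The scalar inputs are placeholder numbers, not real model losses.
def combine_total_loss(train_mode, masked_lm_loss_1, masked_lm_loss_2,
                       siamese_loss, masked_sent_lm_loss_1=0.0,
                       masked_sent_lm_loss_2=0.0,
                       use_masked_sentence_lm_loss=False):
  if train_mode == "joint_train":
    total_loss = masked_lm_loss_1 + masked_lm_loss_2 + siamese_loss
  elif train_mode == "pretrain":
    total_loss = masked_lm_loss_1 + masked_lm_loss_2
  elif train_mode == "finetune":
    total_loss = siamese_loss
  else:
    raise ValueError("Only joint_train, pretrain, finetune are supported.")
  if use_masked_sentence_lm_loss:
    total_loss += masked_sent_lm_loss_1 + masked_sent_lm_loss_2
  return total_loss

# Fine-tuning keeps only the siamese matching loss.
assert combine_total_loss("finetune", 0.8, 0.7, 0.3) == 0.3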
- if init_checkpoint: - tf.logging.info("**** Passed pretrained BERT checkpoint = %s ****", - init_checkpoint) - (assignment_map, initialized_variable_names - ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) - if use_tpu: - - def tpu_scaffold(): - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - return tf.train.Scaffold() - - scaffold_fn = tpu_scaffold - else: - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - - tf.logging.info("**** Trainable Variables ****") - for var in tvars: - init_string = ", *INIT_RANDOMLY*" - if var.name in initialized_variable_names: - init_string = ", *INIT_FROM_CKPT*" - tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, - init_string) - output_spec = None - predicted_score = tf.sigmoid(siamese_logits) - predicted_class = tf.round(predicted_score) - - if dual_encoder_config.encoder_config.model_name == constants.MODEL_NAME_SMITH_DUAL_ENCODER: - _, prediction_dict = utils.get_export_outputs_prediction_dict_smith_de( - seq_embed_1, seq_embed_2, predicted_score, predicted_class, - documents_match_labels, input_sent_embed_1, input_sent_embed_2, - output_sent_embed_1, output_sent_embed_2) - else: - raise ValueError("Unsupported model: %s" % dual_encoder_config.encoder_config.model_name) - - if mode == tf.estimator.ModeKeys.TRAIN: - train_op = optimization.create_optimizer(total_loss, learning_rate, - num_train_steps, - num_warmup_steps, use_tpu) - # Add by:TC 20220705 - output_spec = tf.estimator.EstimatorSpec( - mode=mode, - loss=total_loss, - train_op=train_op, - training_hooks=[npu_tf_config.estimator_dump()]) - - elif mode == tf.estimator.ModeKeys.EVAL: - if (train_mode == constants.TRAIN_MODE_JOINT_TRAIN or - train_mode == constants.TRAIN_MODE_PRETRAIN): - eval_metrics = (metric_fns.metric_fn_pretrain, [ - masked_lm_example_loss_1, masked_lm_weights_1, - masked_sent_per_example_loss_1, masked_sent_weight_1, - masked_lm_example_loss_2, masked_lm_weights_2, - masked_sent_per_example_loss_2, masked_sent_weight_2, - predicted_class, documents_match_labels, is_real_example - ]) - elif train_mode == constants.TRAIN_MODE_FINETUNE: - eval_metrics = (metric_fns.metric_fn_finetune, [ - predicted_class, documents_match_labels, siamese_example_loss, - is_real_example - ]) - else: - raise ValueError("Only joint_train, pretrain, finetune are supported.") - output_spec = tf.estimator.tpu.TPUEstimatorSpec( - mode=mode, - loss=total_loss, - eval_metrics=eval_metrics, - scaffold_fn=scaffold_fn) - - elif mode == tf.estimator.ModeKeys.PREDICT: - output_spec = tf.estimator.tpu.TPUEstimatorSpec( - mode=mode, predictions=prediction_dict, scaffold_fn=scaffold_fn) - else: - raise ValueError("Only TRAIN, EVAL, PREDICT modes are supported: %s" % mode) - - return output_spec - - return model_fn - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_orig.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_orig.py deleted file mode 100644 index 2818aabc8..000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_orig.py +++ /dev/null @@ -1,547 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Library to preprocess text data into SMITH dual encoder model inputs.""" -from npu_bridge.npu_init import * -import collections -import random -import nltk -import tensorflow.compat.v1 as tf -import tqdm -from smith import utils -from smith import wiki_doc_pair_pb2 -from smith.bert import tokenization - -flags = tf.flags - -FLAGS = flags.FLAGS - -flags.DEFINE_string("input_file", None, "Input data path.") - -flags.DEFINE_string( - "output_file", None, - "Output TF examples (or comma-separated list of files) in TFRecord " - "files.") - -flags.DEFINE_string("vocab_file", None, - "The vocabulary file that the SMITH model was trained on.") - -flags.DEFINE_bool( - "do_lower_case", True, - "Whether to lower case the input text. Should be True for uncased " - "models and False for cased models.") - -flags.DEFINE_bool("add_masks_lm", True, - "If true, add masks for word prediction LM pre-training.") - -flags.DEFINE_integer( - "max_sent_length_by_word", 32, "The maximum length of a sentence by tokens." - "A sentence will be cut off if longer than this length, and will be padded " - "if shorter than it. The sentence can also be a sentence block.") - -flags.DEFINE_integer( - "max_doc_length_by_sentence", 64, - "The maximum length of a document by sentences. 
A " - "document will be cut off if longer than this length, and" - "will be padded if shorter than it.") - -flags.DEFINE_bool( - "greedy_sentence_filling", True, - "If true, apply the greedy sentence filling trick to reduce the " - "number of padded tokens.") - -flags.DEFINE_integer("max_predictions_per_seq", 5, - "Maximum number of masked LM predictions per sequence.") - -flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") - -flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") - - -class TrainingInstance(object): - """A single training instance (sentence pair as dual encoder model inputs).""" - - def __init__(self, - tokens_1, - segment_ids_1, - masked_lm_positions_1, - masked_lm_labels_1, - input_mask_1, - masked_lm_weights_1, - tokens_2, - segment_ids_2, - masked_lm_positions_2, - masked_lm_labels_2, - input_mask_2, - masked_lm_weights_2, - instance_id, - documents_match_labels=-1.0): - self.tokens_1 = tokens_1 - self.segment_ids_1 = segment_ids_1 - self.masked_lm_positions_1 = masked_lm_positions_1 - self.masked_lm_labels_1 = masked_lm_labels_1 - self.input_mask_1 = input_mask_1 - self.masked_lm_weights_1 = masked_lm_weights_1 - self.tokens_2 = tokens_2 - self.segment_ids_2 = segment_ids_2 - self.masked_lm_positions_2 = masked_lm_positions_2 - self.masked_lm_labels_2 = masked_lm_labels_2 - self.input_mask_2 = input_mask_2 - self.masked_lm_weights_2 = masked_lm_weights_2 - self.instance_id = instance_id - self.documents_match_labels = documents_match_labels - - def __str__(self): - s = "" - s += "instance_id: %s\n" % self.instance_id - s += "documents_match_labels: %s\n" % (str(self.documents_match_labels)) - s += "tokens_1: %s\n" % (" ".join( - [tokenization.printable_text(x) for x in self.tokens_1])) - s += "segment_ids_1: %s\n" % (" ".join([str(x) for x in self.segment_ids_1 - ])) - s += "masked_lm_positions_1: %s\n" % (" ".join( - [str(x) for x in self.masked_lm_positions_1])) - s += "masked_lm_labels_1: %s\n" % (" ".join( - [tokenization.printable_text(x) for x in self.masked_lm_labels_1])) - s += "input_mask_1: %s\n" % (" ".join([str(x) for x in self.input_mask_1])) - s += "masked_lm_weights_1: %s\n" % (" ".join( - [str(x) for x in self.masked_lm_weights_1])) - s += "tokens_2: %s\n" % (" ".join( - [tokenization.printable_text(x) for x in self.tokens_2])) - s += "segment_ids_2: %s\n" % (" ".join([str(x) for x in self.segment_ids_2 - ])) - s += "masked_lm_positions_2: %s\n" % (" ".join( - [str(x) for x in self.masked_lm_positions_2])) - s += "masked_lm_labels_2: %s\n" % (" ".join( - [tokenization.printable_text(x) for x in self.masked_lm_labels_2])) - s += "input_mask_2: %s\n" % (" ".join([str(x) for x in self.input_mask_2])) - s += "masked_lm_weights_2: %s\n" % (" ".join( - [str(x) for x in self.masked_lm_weights_2])) - s += "\n" - return s - - def __repr__(self): - return self.__str__() - - -def add_features_for_one_doc(features, tokens, segment_ids, input_mask, - masked_lm_positions, masked_lm_labels, - masked_lm_weights, tokenizer, doc_index): - """Add features for one document in a WikiDocPair example.""" - input_ids = tokenizer.convert_tokens_to_ids(tokens) - features["input_ids_" + doc_index] = utils.create_int_feature(input_ids) - features["input_mask_" + doc_index] = utils.create_int_feature(input_mask) - features["segment_ids_" + doc_index] = utils.create_int_feature(segment_ids) - - if masked_lm_labels: - masked_lm_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels) - features["masked_lm_positions_" + - doc_index] = 
utils.create_int_feature(masked_lm_positions) - features["masked_lm_ids_" + - doc_index] = utils.create_int_feature(masked_lm_ids) - features["masked_lm_weights_" + - doc_index] = utils.create_float_feature(masked_lm_weights) - - -def write_instance_to_example_files(instances, tokenizer, output_files): - """Create TF example files from `TrainingInstance`s.""" - writers = [] - for output_file in output_files: - writers.append(tf.python_io.TFRecordWriter(output_file)) - writer_index = 0 - total_written = 0 - for (inst_index, instance) in enumerate(instances): - features = collections.OrderedDict() - add_features_for_one_doc( - features=features, - tokens=instance.tokens_1, - segment_ids=instance.segment_ids_1, - input_mask=instance.input_mask_1, - masked_lm_positions=instance.masked_lm_positions_1, - masked_lm_labels=instance.masked_lm_labels_1, - masked_lm_weights=instance.masked_lm_weights_1, - tokenizer=tokenizer, - doc_index="1") - add_features_for_one_doc( - features=features, - tokens=instance.tokens_2, - segment_ids=instance.segment_ids_2, - input_mask=instance.input_mask_2, - masked_lm_positions=instance.masked_lm_positions_2, - masked_lm_labels=instance.masked_lm_labels_2, - masked_lm_weights=instance.masked_lm_weights_2, - tokenizer=tokenizer, - doc_index="2") - # Adds fields on more content/id information of the current example. - features["instance_id"] = utils.create_bytes_feature( - [bytes(instance.instance_id, "utf-8")]) - features["tokens_1"] = utils.create_bytes_feature( - [bytes(t, "utf-8") for t in instance.tokens_1]) - features["tokens_2"] = utils.create_bytes_feature( - [bytes(t, "utf-8") for t in instance.tokens_2]) - # Adds the documents matching labels. - features["documents_match_labels"] = utils.create_float_feature( - [float(instance.documents_match_labels)]) - tf_example = tf.train.Example(features=tf.train.Features(feature=features)) - - writers[writer_index].write(tf_example.SerializeToString()) - writer_index = (writer_index + 1) % len(writers) - - total_written += 1 - - if inst_index < 5: - tf.logging.info("*** Example ***") - tf.logging.info( - "tokens_1: %s" % - " ".join([tokenization.printable_text(x) for x in instance.tokens_1])) - tf.logging.info( - "tokens_2: %s" % - " ".join([tokenization.printable_text(x) for x in instance.tokens_2])) - - for feature_name in features.keys(): - feature = features[feature_name] - values = [] - if feature.int64_list.value: - values = feature.int64_list.value - elif feature.float_list.value: - values = feature.float_list.value - elif feature.bytes_list.value: - values = feature.bytes_list.value - tf.logging.info("%s: %s" % - (feature_name, " ".join([str(x) for x in values]))) - - for writer in writers: - writer.close() - - tf.logging.info("Wrote %d total instances", total_written) - - -def get_smith_model_tokens(input_text, tokenizer, sent_token_counter): - """Generate tokens given an input text for the SMITH model.""" - res_tokens = [] - for sent in nltk.tokenize.sent_tokenize(input_text): - # The returned res_tokens is a 2D list to maintain the sentence boundary - # information. We removed all the empty tokens in this step. - if not sent: - continue - tokens = [w for w in tokenizer.tokenize(sent) if w] - sent_token_counter[0] += 1 # Track number of sentences. - sent_token_counter[1] += len(tokens) # Track number of tokens. 
- res_tokens.append(tokens) - return (res_tokens, sent_token_counter) - - -def create_training_instances_wiki_doc_pair( - input_file, tokenizer, max_sent_length_by_word, max_doc_length_by_sentence, - masked_lm_prob, max_predictions_per_seq, rng): - """Create `TrainingInstance`s from WikiDocPair proto data.""" - # The input data is in the WikiDocPair proto format in tfrecord. - # Add by:TC - wiki_doc_pair = wiki_doc_pair_pb2.WikiDocPair() - instances = [] - # Add some counters to track some data statistics. - sent_token_counter = [0, 0] - for example in tqdm.tqdm(tf.python_io.tf_record_iterator(input_file)): - doc_pair = wiki_doc_pair.FromString(example) - # If model_name = smith_dual_encoder, we firstly use a sentence tokenizer - # to split doc_one/doc_two texts into different sentences and use [SEN] to - # label the sentence boundary information. So in the masking and padding - # step, we know the boundary between different sentences and we can do the - # masking and padding according to the actual length of each sentence. - doc_one_text = " \n\n\n\n\n\n ".join( - [a.text for a in doc_pair.doc_one.section_contents]) - doc_two_text = " \n\n\n\n\n\n ".join( - [a.text for a in doc_pair.doc_two.section_contents]) - doc_one_text = tokenization.convert_to_unicode(doc_one_text).strip() - doc_two_text = tokenization.convert_to_unicode(doc_two_text).strip() - doc_one_tokens, sent_token_counter = get_smith_model_tokens( - doc_one_text, tokenizer, sent_token_counter) - doc_two_tokens, sent_token_counter = get_smith_model_tokens( - doc_two_text, tokenizer, sent_token_counter) - # Skip the document pairs if any document is empty. - if not doc_one_tokens or not doc_two_tokens: - continue - vocab_words = list(tokenizer.vocab.keys()) - instance_id = doc_pair.id - if doc_pair.human_label_for_classification: - doc_match_label = doc_pair.human_label_for_classification - else: - # Set the label as 0.0 if there are no available labels. 
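# A small sketch of the sentence-splitting idea in get_smith_model_tokens:
# nltk.tokenize.sent_tokenize keeps sentence boundaries as a 2D token list.
# The whitespace split below is only a stand-in for the BERT FullTokenizer
# (which needs a vocab file), and nltk's "punkt" data must be installed
# locally for sent_tokenize to work.
import nltk

def split_into_sentence_token_lists(text):
  res_tokens = []
  for sent in nltk.tokenize.sent_tokenize(text):
    tokens = [w for w in sent.split() if w]  # stand-in word tokenizer
    if tokens:
      res_tokens.append(tokens)
  return res_tokens

# e.g. returns [['First', 'sentence.'], ['Second', 'one.']]
print(split_into_sentence_token_lists("First sentence. Second one."))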
- doc_match_label = 0.0 - instances.append( - create_instance_from_wiki_doc_pair( - instance_id, doc_match_label, doc_one_tokens, doc_two_tokens, - max_sent_length_by_word, max_doc_length_by_sentence, masked_lm_prob, - max_predictions_per_seq, vocab_words, rng)) - rng.shuffle(instances) - return (instances, sent_token_counter) - - -def create_instance_from_wiki_doc_pair(instance_id, doc_match_label, - doc_one_tokens, doc_two_tokens, - max_sent_length_by_word, - max_doc_length_by_sentence, - masked_lm_prob, max_predictions_per_seq, - vocab_words, rng): - """Creates `TrainingInstance`s for a WikiDocPair input data.""" - (tokens_1, segment_ids_1, masked_lm_positions_1, masked_lm_labels_1, \ - input_mask_1, masked_lm_weights_1) = \ - get_tokens_segment_ids_masks(max_sent_length_by_word, max_doc_length_by_sentence, doc_one_tokens, masked_lm_prob, - max_predictions_per_seq, vocab_words, rng) - (tokens_2, segment_ids_2, masked_lm_positions_2, masked_lm_labels_2, \ - input_mask_2, masked_lm_weights_2) = \ - get_tokens_segment_ids_masks(max_sent_length_by_word, max_doc_length_by_sentence, doc_two_tokens, masked_lm_prob, - max_predictions_per_seq, vocab_words, rng) - instance = TrainingInstance( - tokens_1=tokens_1, - segment_ids_1=segment_ids_1, - masked_lm_positions_1=masked_lm_positions_1, - masked_lm_labels_1=masked_lm_labels_1, - input_mask_1=input_mask_1, - masked_lm_weights_1=masked_lm_weights_1, - tokens_2=tokens_2, - segment_ids_2=segment_ids_2, - masked_lm_positions_2=masked_lm_positions_2, - masked_lm_labels_2=masked_lm_labels_2, - input_mask_2=input_mask_2, - masked_lm_weights_2=masked_lm_weights_2, - instance_id=instance_id, - documents_match_labels=doc_match_label) - return instance - - -def get_tokens_segment_ids_masks(max_sent_length_by_word, - max_doc_length_by_sentence, doc_one_tokens, - masked_lm_prob, max_predictions_per_seq, - vocab_words, rng): - """Get the tokens, segment ids and masks of an input sequence.""" - # The format of tokens for SMITH dual encoder models is like: - # [CLS] block1_token1 block1_token2 block1_token3 ... [SEP] [SEP] [PAD] ... - # [CLS] block2_token1 block2_token2 block2_token3 ... [SEP] [SEP] [PAD] ... - # [CLS] block3_token1 block3_token2 block3_token3 ... [SEP] [SEP] [PAD] ... - # If max_sent_length_by_word is large, then there will be many padded - # words in the sentence. Here we added an optional "greedy sentence filling" - # trick in order to reduce the number of padded words and maintain all - # content in the document. We allow a "sentence" block to contain more than - # one natural sentence and try to fill as many as sentences into the - # "sentence" block. If a sentence will be cut off and the current sentence - # block is not empty, we will put the sentence into the next "sentence" block. - # According to ALBERT paper and RoBERTa paper, a segment is usually comprised - # of more than one natural sentence, which has been shown to benefit - # performance. doc_one_tokens is a 2D list which contains the sentence - # boundary information. - sentence_num = len(doc_one_tokens) - # sent_block_token_list is a 2D list to maintain sentence block tokens. - sent_block_token_list = [] - natural_sentence_index = -1 - while natural_sentence_index + 1 < sentence_num: - natural_sentence_index += 1 - sent_tokens = doc_one_tokens[natural_sentence_index] - if not sent_tokens: - continue - if FLAGS.greedy_sentence_filling: - cur_sent_block_length = 0 - cur_sent_block = [] - # Fill as many senteces as possible in the current sentence block in a - # greedy way. 
- while natural_sentence_index < sentence_num: - cur_natural_sent_tokens = doc_one_tokens[natural_sentence_index] - if not cur_natural_sent_tokens: - natural_sentence_index += 1 - continue - cur_sent_len = len(cur_natural_sent_tokens) - if ((cur_sent_block_length + cur_sent_len) <= - (max_sent_length_by_word - 3)) or cur_sent_block_length == 0: - # One exceptional case here is that if the 1st sentence of a sentence - # block is already going across the boundary, then the current - # sentence block will be empty. So when cur_sent_block_length is 0 - # and we meet a natural sentence with length longer than - # (max_sent_length_by_word - 3), we still put this natural sentence - # in the current sentence block. In this case, this long natural - # sentence will be cut off with the final length up to - # (max_sent_length_by_word - 3). - cur_sent_block.extend(cur_natural_sent_tokens) - cur_sent_block_length += cur_sent_len - natural_sentence_index += 1 - else: - # If cur_sent_block_length + cur_sent_len > max_sent_length_by_word-3 - # and the current sentence block is not empty, the sentence which - # goes across the boundary will be put into the next sentence block. - natural_sentence_index -= 1 - break - sent_tokens = cur_sent_block - sent_block_token_list.append(sent_tokens) - if len(sent_block_token_list) >= max_doc_length_by_sentence: - break # Skip more sentence blocks if the document is too long. - # For each sentence block, generate the token sequences, masks and paddings. - tokens_doc = [] - segment_ids_doc = [] - masked_lm_positions_doc = [] - masked_lm_labels_doc = [] - input_mask_doc = [] - masked_lm_weights_doc = [] - for block_index in range(len(sent_block_token_list)): - tokens_block, segment_ids_block, masked_lm_positions_block, \ - masked_lm_labels_block, input_mask_block, masked_lm_weights_block = \ - get_token_masks_paddings( - sent_block_token_list[block_index], - max_sent_length_by_word, - masked_lm_prob, - max_predictions_per_seq, - vocab_words, - rng, - block_index) - tokens_doc.extend(tokens_block) - segment_ids_doc.extend(segment_ids_block) - masked_lm_positions_doc.extend(masked_lm_positions_block) - masked_lm_labels_doc.extend(masked_lm_labels_block) - input_mask_doc.extend(input_mask_block) - masked_lm_weights_doc.extend(masked_lm_weights_block) - - # Pad sentence blocks if the actual number of sentence blocks is less than - # max_doc_length_by_sentence. - sentence_block_index = len(sent_block_token_list) - while sentence_block_index < max_doc_length_by_sentence: - for _ in range(max_sent_length_by_word): - tokens_doc.append("[PAD]") - segment_ids_doc.append(0) - input_mask_doc.append(0) - for _ in range(max_predictions_per_seq): - masked_lm_positions_doc.append(0) - masked_lm_labels_doc.append("[PAD]") - masked_lm_weights_doc.append(0.0) - sentence_block_index += 1 - assert len(tokens_doc) == max_sent_length_by_word * max_doc_length_by_sentence - assert len(masked_lm_labels_doc - ) == max_predictions_per_seq * max_doc_length_by_sentence - return (tokens_doc, segment_ids_doc, masked_lm_positions_doc, - masked_lm_labels_doc, input_mask_doc, masked_lm_weights_doc) - - -def get_token_masks_paddings(block_tokens, max_sent_length_by_word, - masked_lm_prob, max_predictions_per_seq, - vocab_words, rng, block_index): - """Generates tokens, masks and paddings for the input block tokens.""" - # Account for [CLS], [SEP], [SEP] - max_num_tokens = max_sent_length_by_word - 3 - # Truncates the sequence if sequence length is longer than max_num_tokens. 
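# A pure-Python sketch of the greedy sentence filling described above: natural
# sentences are packed into fixed-size "sentence blocks", a sentence that would
# overflow a non-empty block is deferred to the next block, and an over-long
# sentence that starts an empty block is still placed there (it is truncated
# later). This illustrates the idea only; it is not the exact preprocessing code.
def greedy_fill(sentences, max_sent_length_by_word, max_doc_length_by_sentence):
  blocks = []
  i = 0
  while i < len(sentences) and len(blocks) < max_doc_length_by_sentence:
    block, block_len = [], 0
    while i < len(sentences):
      sent = sentences[i]
      # Reserve 3 positions per block for [CLS], [SEP], [SEP].
      if block_len == 0 or block_len + len(sent) <= max_sent_length_by_word - 3:
        block.extend(sent)
        block_len += len(sent)
        i += 1
      else:
        break
    blocks.append(block)
  return blocks

# Two blocks: the third sentence would overflow the first block (budget 20 - 3 = 17).
print(greedy_fill([["a"] * 8, ["b"] * 7, ["c"] * 5], 20, 3))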
- tokens = [] - segment_ids = [] - if len(block_tokens) > max_num_tokens: - block_tokens = block_tokens[0:max_num_tokens] - tokens_a = block_tokens - tokens.append("[CLS]") - segment_ids.append(0) - for token in tokens_a: - tokens.append(token) - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) - masked_lm_positions = [] - masked_lm_labels = [] - masked_lm_weights = [] - if max_predictions_per_seq > 0: - (tokens, masked_lm_positions, - masked_lm_labels) = utils.create_masked_lm_predictions( - tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) - # Add [PAD] to tokens and masked LM related lists. - input_mask = [1] * len(tokens) - while len(tokens) < max_sent_length_by_word: - tokens.append("[PAD]") - input_mask.append(0) - segment_ids.append(0) - - assert len(tokens) == max_sent_length_by_word - assert len(input_mask) == max_sent_length_by_word - assert len(segment_ids) == max_sent_length_by_word - - if max_predictions_per_seq > 0: - # Transfer local positions in masked_lm_positions to global positions in the - # whole document to be consistent with the model training pipeline. - masked_lm_positions = [ - (i + max_sent_length_by_word * block_index) for i in masked_lm_positions - ] - masked_lm_weights = [1.0] * len(masked_lm_labels) - - while len(masked_lm_positions) < max_predictions_per_seq: - masked_lm_positions.append(0) - masked_lm_labels.append("[PAD]") - masked_lm_weights.append(0.0) - return (tokens, segment_ids, masked_lm_positions, masked_lm_labels, - input_mask, masked_lm_weights) - - -def main(_): - tf.logging.set_verbosity(tf.logging.INFO) - - tokenizer = tokenization.FullTokenizer( - vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) - - input_files = [] - for input_pattern in FLAGS.input_file.split(","): - input_files.extend(tf.gfile.Glob(input_pattern)) - - tf.logging.info("*** Reading from input files ***") - for input_file in input_files: - tf.logging.info(" %s", input_file) - rng = random.Random(FLAGS.random_seed) - # Creates training instances. - max_predictions_per_seq = FLAGS.max_predictions_per_seq if FLAGS.add_masks_lm else 0 - masked_lm_prob = FLAGS.masked_lm_prob if FLAGS.add_masks_lm else 0 - instances, sent_token_counter = create_training_instances_wiki_doc_pair( - input_file=FLAGS.input_file, - tokenizer=tokenizer, - max_sent_length_by_word=FLAGS.max_sent_length_by_word, - max_doc_length_by_sentence=FLAGS.max_doc_length_by_sentence, - masked_lm_prob=masked_lm_prob, - max_predictions_per_seq=max_predictions_per_seq, - rng=rng) - - output_files = FLAGS.output_file.split(",") - tf.logging.info("*** Writing to output files ***") - for output_file in output_files: - tf.logging.info(" %s", output_file) - - # Transfers training instances into tensorflow examples and write the results. - write_instance_to_example_files(instances, tokenizer, output_files) - - # Finally outputs some data statistics. 
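# A worked example of the per-block token layout and the local-to-global
# masked LM position mapping used above, with a toy max_sent_length_by_word
# of 8; the tokens are made up for illustration.
max_sent_length_by_word = 8
block_tokens = ["my", "dinner"]
tokens = ["[CLS]"] + block_tokens + ["[SEP]", "[SEP]"]
input_mask = [1] * len(tokens)
while len(tokens) < max_sent_length_by_word:
  tokens.append("[PAD]")
  input_mask.append(0)
# tokens == ['[CLS]', 'my', 'dinner', '[SEP]', '[SEP]', '[PAD]', '[PAD]', '[PAD]']
# input_mask == [1, 1, 1, 1, 1, 0, 0, 0]

# If word position 2 ("dinner") were masked in block_index 3, its global
# position in the flattened document sequence would be:
block_index = 3
global_position = 2 + max_sent_length_by_word * block_index  # == 26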
- tf.logging.info("sent_count, token_count, doc_pair_count: %d %d %d", - sent_token_counter[0], sent_token_counter[1], len(instances)) - - -if __name__ == "__main__": - flags.mark_flag_as_required("input_file") - flags.mark_flag_as_required("output_file") - flags.mark_flag_as_required("vocab_file") - tf.app.run() - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test_orig.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test_orig.py deleted file mode 100644 index f4b828c75..000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test_orig.py +++ /dev/null @@ -1,108 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from npu_bridge.npu_init import * -import random -import tempfile - -from absl import flags -import tensorflow.compat.v1 as tf - -from smith import preprocessing_smith -from smith.bert import tokenization - -FLAGS = flags.FLAGS - - -class PreprocessingSmithTest(tf.test.TestCase): - - def setUp(self): - super(PreprocessingSmithTest, self).setUp() - doc_one_text = ( - "I am in Dominick's for my dinner. OK, no problem. 
I am " - "in Dominick's for my dinner which is the best dinner I have " - "in my whole life.") - doc_one_text = tokenization.convert_to_unicode(doc_one_text).strip() - vocab_tokens = [ - "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "i", "am", "in", "for", - "my", "dinner", "ok", "no", "problem", "which", "is", "the", "be", - "##s", "##t", "," - ] - with tempfile.NamedTemporaryFile(delete=False) as vocab_writer: - vocab_writer.write("".join([x + "\n" for x in vocab_tokens - ]).encode("utf-8")) - self.vocab_file = vocab_writer.name - self.tokenizer = tokenization.FullTokenizer( - vocab_file=self.vocab_file, do_lower_case=True) - self.vocab_words = list(self.tokenizer.vocab.keys()) - self.rng = random.Random(12345) - self.doc_one_tokens, _ = preprocessing_smith.get_smith_model_tokens( - doc_one_text, self.tokenizer, [0, 0]) - self.max_sent_length_by_word = 20 - self.max_doc_length_by_sentence = 3 - self.greedy_sentence_filling = True - self.max_predictions_per_seq = 0 - self.masked_lm_prob = 0 - - def test_get_tokens_segment_ids_masks(self): - (tokens_1, segment_ids_1, _, _, input_mask_1, _) = \ - preprocessing_smith.get_tokens_segment_ids_masks( - max_sent_length_by_word=self.max_sent_length_by_word, - max_doc_length_by_sentence=self.max_doc_length_by_sentence, - doc_one_tokens=self.doc_one_tokens, - masked_lm_prob=self.masked_lm_prob, - max_predictions_per_seq=self.max_predictions_per_seq, - vocab_words=self.vocab_words, - rng=self.rng) - self.assertEqual(tokens_1, [ - "[CLS]", "i", "am", "in", "[UNK]", "[UNK]", "[UNK]", "for", "my", - "dinner", "[UNK]", "ok", ",", "no", "problem", "[UNK]", "[SEP]", - "[SEP]", "[PAD]", "[PAD]", "[CLS]", "i", "am", "in", "[UNK]", "[UNK]", - "[UNK]", "for", "my", "dinner", "which", "is", "the", "be", "##s", - "##t", "dinner", "i", "[SEP]", "[SEP]", "[PAD]", "[PAD]", "[PAD]", - "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", - "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", - "[PAD]" - ]) - self.assertEqual(segment_ids_1, [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - ]) - self.assertEqual(input_mask_1, [ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - ]) - - -if __name__ == "__main__": - tf.test.main() - -- Gitee From 9c09b647738c4186473e3c703426223bbbafe145 Mon Sep 17 00:00:00 2001 From: QiuYao Date: Tue, 11 Oct 2022 19:51:23 +0800 Subject: [PATCH 3/3] update file --- .../nlp/smith_ID2025_for_ACL/README.md | 3 +- .../contrib/nlp/smith_ID2025_for_ACL/atc.sh | 3 +- .../nlp/smith_ID2025_for_ACL/ckpt2pb.py | 48 ++++++++++++++----- .../nlp/smith_ID2025_for_ACL/ckpt2pb.sh | 2 +- .../gen_bin_by_img2bin.sh | 3 +- .../contrib/nlp/smith_ID2025_for_ACL/msame.sh | 3 +- 6 files changed, 45 insertions(+), 17 deletions(-) diff --git a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/README.md b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/README.md index f638d40a5..ccba07f62 100644 --- a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/README.md +++ b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/README.md @@ -109,4 +109,5 @@ python3 img2bin.py -i ./input_mask_2.txt -t int32 -o ./out/ {"predicted_score": "0.5", "predicted_class": "0.0"} {"predicted_score": "0.5", "predicted_class": "0.0"} {"predicted_score": "0.9975251", "predicted_class": "1.0"} 
-{"predicted_score": "0.99752605", "predicted_class": "1.0"} \ No newline at end of file +{"predicted_score": "0.99752605", "predicted_class": "1.0"} + diff --git a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/atc.sh b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/atc.sh index b7c416779..6d15ed595 100644 --- a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/atc.sh +++ b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/atc.sh @@ -1 +1,2 @@ -atc --model=smith.pb --framework=3 --output=pb_res --soc_version=Ascend910 --input_shape="input_ids_1:32,2048;input_mask_1:32,2048;input_ids_2:32,2048;input_mask_2:32,2048" --out_nodes="seq_rep_from_bert_doc_dense/l2_normalize_1:0;Sigmoid:0;Round:0" --log=debug \ No newline at end of file +atc --model=smith.pb --framework=3 --output=pb_res --soc_version=Ascend910 --input_shape="input_ids_1:32,2048;input_mask_1:32,2048;input_ids_2:32,2048;input_mask_2:32,2048" --out_nodes="seq_rep_from_bert_doc_dense/l2_normalize_1:0;Sigmoid:0;Round:0" --log=debug + diff --git a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.py b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.py index 8d8140ab6..a65ffcb51 100644 --- a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.py +++ b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.py @@ -1,9 +1,33 @@ -# -*- coding: utf-8 -*- -""" - Created on 2022/4/21 0:18 - - @Author T.c -""" +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + from absl import flags from absl import app import tensorflow.compat.v1 as tf @@ -16,17 +40,19 @@ from tensorflow.python.framework import graph_util FLAGS = flags.FLAGS flags.DEFINE_string("dual_encoder_config_file", None, "The proto config file for dual encoder SMITH models.") +flags.DEFINE_string("ckpt_path", None, "The NPU ckpt file.") +flags.DEFINE_string("output_graph", "smith.pb", "The output path of pb file.") -# 指定checkpoint路径 -ckpt_path = "/home/test_user06/tc_workspace/data/result_file/tc_wsp_20220920_V4/model.ckpt-10000" +ckpt_path = FLAGS.ckpt_path +output_graph = FLAGS.output_graph def main(_argv): input_ids_1 = tf.placeholder(tf.int32, shape=(32, 2048), name="input_ids_1") - input_mask_1 = tf.placeholder(tf.int32, shape=(32, 2048), name="input_mask_1") #features["input_mask_1"] + input_mask_1 = tf.placeholder(tf.int32, shape=(32, 2048), name="input_mask_1") input_ids_2 = tf.placeholder(tf.int32, shape=(32, 2048), name="input_ids_2") - input_mask_2 = tf.placeholder(tf.int32, shape=(32, 2048), name="input_mask_2") #features["input_mask_2"] + input_mask_2 = tf.placeholder(tf.int32, shape=(32, 2048), name="input_mask_2") exp_config = utils.load_config_from_file(FLAGS.dual_encoder_config_file, experiment_config_pb2.DualEncoderConfig()) tf.logging.info("*** Features ***") masked_lm_positions_1 = tf.zeros([1]) @@ -55,8 +81,6 @@ def main(_argv): graph = tf.get_default_graph() input_graph_def = graph.as_graph_def() - output_graph = "/home/test_user06/tc_workspace/smith_0927_del_full_dropout_27_NPU.pb" - with tf.Session() as sess: sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() diff --git a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.sh b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.sh index f4b954047..077429966 100644 --- a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.sh +++ b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.sh @@ -1 +1 @@ -python3 ckpt2pb.py --dual_encoder_config_file=smith/config/dual_encoder_config.smith_wsp.32.48.pbtxt \ No newline at end of file +python3 ckpt2pb.py --dual_encoder_config_file=smith/config/dual_encoder_config.smith_wsp.32.48.pbtxt --ckpt_path=./model.ckpt-10000 --output_graph=./smith.pb diff --git a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/gen_bin_by_img2bin.sh b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/gen_bin_by_img2bin.sh index a148192ea..7a74ffa30 100644 --- a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/gen_bin_by_img2bin.sh +++ b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/gen_bin_by_img2bin.sh @@ -1,4 +1,5 @@ python3 img2bin.py -i ./input_ids_1.txt -t int32 -o ./out/ python3 img2bin.py -i ./input_ids_2.txt -t int32 -o ./out/ python3 img2bin.py -i ./input_mask_1.txt -t int32 -o ./out/ -python3 img2bin.py -i ./input_mask_2.txt -t int32 -o ./out/ \ No newline at end of file +python3 img2bin.py -i ./input_mask_2.txt -t int32 -o ./out/ + diff --git a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/msame.sh b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/msame.sh index f9f5f37d5..b6b983348 100644 --- a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/msame.sh +++ b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/msame.sh @@ -1 +1,2 @@ -./msame --model "pb_res.om" --input "out/out4tmp/input_ids_1.bin,out/out4tmp/input_ids_2.bin,out/out4tmp/input_mask_1.bin,out/out4tmp/input_mask_2.bin" --output "output" --loop 1 --outfmt TXT --debug true \ No newline at end of file +./msame --model 
"pb_res.om" --input "out/out4tmp/input_ids_1.bin,out/out4tmp/input_ids_2.bin,out/out4tmp/input_mask_1.bin,out/out4tmp/input_mask_2.bin" --output "output" --loop 1 --outfmt TXT --debug true + -- Gitee