diff --git a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/README.md b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/README.md index f638d40a54b5ea4cec4066e58bd1b7bc2cd8847e..73d3befa04145086db50262d7040895ae7ffaa9b 100644 --- a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/README.md +++ b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/README.md @@ -4,6 +4,8 @@ * TensorFlow 1.15.0 * Python 3.7.0 +## 迁移部分代码[地址](https://gitee.com/ascend/ModelZoo-TensorFlow/pulls/790) + ## 代码及路径解释 ``` smith_ID2025_for_ACL @@ -109,4 +111,5 @@ python3 img2bin.py -i ./input_mask_2.txt -t int32 -o ./out/ {"predicted_score": "0.5", "predicted_class": "0.0"} {"predicted_score": "0.5", "predicted_class": "0.0"} {"predicted_score": "0.9975251", "predicted_class": "1.0"} -{"predicted_score": "0.99752605", "predicted_class": "1.0"} \ No newline at end of file +{"predicted_score": "0.99752605", "predicted_class": "1.0"} + diff --git a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/atc.sh b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/atc.sh index b7c4167793644ec43779a418451310ad60f384a2..6d15ed595a9391e4fc8e44bbb095124ae9d7ac9f 100644 --- a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/atc.sh +++ b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/atc.sh @@ -1 +1,2 @@ -atc --model=smith.pb --framework=3 --output=pb_res --soc_version=Ascend910 --input_shape="input_ids_1:32,2048;input_mask_1:32,2048;input_ids_2:32,2048;input_mask_2:32,2048" --out_nodes="seq_rep_from_bert_doc_dense/l2_normalize_1:0;Sigmoid:0;Round:0" --log=debug \ No newline at end of file +atc --model=smith.pb --framework=3 --output=pb_res --soc_version=Ascend910 --input_shape="input_ids_1:32,2048;input_mask_1:32,2048;input_ids_2:32,2048;input_mask_2:32,2048" --out_nodes="seq_rep_from_bert_doc_dense/l2_normalize_1:0;Sigmoid:0;Round:0" --log=debug + diff --git a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.py b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.py index 8d8140ab61244f12948a2199b35ea6952b7e9174..beb57e7817e48ebade1569a54c1755dd82793f8d 100644 --- a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.py +++ b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.py @@ -1,9 +1,33 @@ -# -*- coding: utf-8 -*- -""" - Created on 2022/4/21 0:18 - - @Author T.c -""" +# coding=utf-8 +# Copyright 2021 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + from absl import flags from absl import app import tensorflow.compat.v1 as tf @@ -16,17 +40,19 @@ from tensorflow.python.framework import graph_util FLAGS = flags.FLAGS flags.DEFINE_string("dual_encoder_config_file", None, "The proto config file for dual encoder SMITH models.") +flags.DEFINE_string("ckpt_path", None, "The NPU ckpt file.") +flags.DEFINE_string("output_graph", "smith.pb", "The output path of pb file.") -# 指定checkpoint路径 -ckpt_path = "/home/test_user06/tc_workspace/data/result_file/tc_wsp_20220920_V4/model.ckpt-10000" +ckpt_path = FLAGS.ckpt_path +output_graph = FLAGS.output_graph def main(_argv): input_ids_1 = tf.placeholder(tf.int32, shape=(32, 2048), name="input_ids_1") - input_mask_1 = tf.placeholder(tf.int32, shape=(32, 2048), name="input_mask_1") #features["input_mask_1"] + input_mask_1 = tf.placeholder(tf.int32, shape=(32, 2048), name="input_mask_1") input_ids_2 = tf.placeholder(tf.int32, shape=(32, 2048), name="input_ids_2") - input_mask_2 = tf.placeholder(tf.int32, shape=(32, 2048), name="input_mask_2") #features["input_mask_2"] + input_mask_2 = tf.placeholder(tf.int32, shape=(32, 2048), name="input_mask_2") exp_config = utils.load_config_from_file(FLAGS.dual_encoder_config_file, experiment_config_pb2.DualEncoderConfig()) tf.logging.info("*** Features ***") masked_lm_positions_1 = tf.zeros([1]) @@ -55,8 +81,6 @@ def main(_argv): graph = tf.get_default_graph() input_graph_def = graph.as_graph_def() - output_graph = "/home/test_user06/tc_workspace/smith_0927_del_full_dropout_27_NPU.pb" - with tf.Session() as sess: sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() @@ -64,7 +88,7 @@ def main(_argv): output_graph_def = graph_util.convert_variables_to_constants( sess=sess, input_graph_def=input_graph_def, - output_node_names=["seq_rep_from_bert_doc_dense/l2_normalize_1","Sigmoid","Round"]) + output_node_names=["seq_rep_from_bert_doc_dense/l2_normalize_1", "Sigmoid", "Round"]) with tf.gfile.GFile(output_graph, "wb") as f: f.write(output_graph_def.SerializeToString()) diff --git a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.sh b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.sh index f4b954047532cc10c9680406ad533e55d14b8541..077429966cc6cf986cacec58d5b716fe080d6e5d 100644 --- a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.sh +++ b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/ckpt2pb.sh @@ -1 +1 @@ -python3 ckpt2pb.py --dual_encoder_config_file=smith/config/dual_encoder_config.smith_wsp.32.48.pbtxt \ No newline at end of file +python3 ckpt2pb.py --dual_encoder_config_file=smith/config/dual_encoder_config.smith_wsp.32.48.pbtxt --ckpt_path=./model.ckpt-10000 --output_graph=./smith.pb diff --git a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/gen_bin_by_img2bin.sh b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/gen_bin_by_img2bin.sh index a148192ea0e80f03acd22318d4b97694ba08e3d5..7a74ffa30a2ea4313c47066ed3683c36e96a7c94 100644 --- a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/gen_bin_by_img2bin.sh +++ b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/gen_bin_by_img2bin.sh @@ -1,4 +1,5 @@ python3 img2bin.py -i ./input_ids_1.txt -t int32 -o ./out/ python3 img2bin.py -i ./input_ids_2.txt -t int32 -o ./out/ python3 img2bin.py -i ./input_mask_1.txt -t int32 -o ./out/ -python3 img2bin.py -i ./input_mask_2.txt -t int32 -o ./out/ \ No newline at end of file +python3 img2bin.py -i ./input_mask_2.txt -t int32 -o ./out/ + diff --git 
a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/msame.sh b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/msame.sh index f9f5f37d506474454a8df557824a685fb1580d7b..b6b98334892faa63c9facff86293b852e3ab37c3 100644 --- a/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/msame.sh +++ b/ACL_TensorFlow/contrib/nlp/smith_ID2025_for_ACL/msame.sh @@ -1 +1,2 @@ -./msame --model "pb_res.om" --input "out/out4tmp/input_ids_1.bin,out/out4tmp/input_ids_2.bin,out/out4tmp/input_mask_1.bin,out/out4tmp/input_mask_2.bin" --output "output" --loop 1 --outfmt TXT --debug true \ No newline at end of file +./msame --model "pb_res.om" --input "out/out4tmp/input_ids_1.bin,out/out4tmp/input_ids_2.bin,out/out4tmp/input_mask_1.bin,out/out4tmp/input_mask_2.bin" --output "output" --loop 1 --outfmt TXT --debug true + diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/.keep b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/.keep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/README.md b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/README.md deleted file mode 100644 index e88fe4a15290d277e8163ce0bd7360b388435597..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/README.md +++ /dev/null @@ -1,194 +0,0 @@ -- [基本信息](#基本信息.md) -- [概述](#概述.md) -- [训练环境准备](#训练环境准备.md) -- [快速上手](#快速上手.md) -

-<h2 id="基本信息.md">基本信息</h2>

-**发布者(Publisher):Huawei** - -**应用领域(Application Domain):Natural Language Processing** - -**版本(Version):1.1** - -**修改时间(Modified) :2022.09.02** - -**大小(Size):2M** - -**框架(Framework):TensorFlow 1.15.0** - -**模型格式(Model Format):ckpt** - -**精度(Precision):Mixed** - -**处理器(Processor):昇腾910** - -**应用级别(Categories):Official** - -**描述(Description):基于TensorFlow框架的smith代码** - -

-<h2 id="概述.md">概述</h2>

- 许多自然语言处理和信息检索问题可以形式化为语义匹配任务。以往的工作主要集中在短文本之间的匹配或短文本和长文本之间的匹配。长篇文档之间的语义匹配在新闻推荐、相关文章推荐和文档聚类等许多重要应用中的应用相对较少,需要更多的研究工作。这项工作通过提出用于长格式文档匹配的Siamese Multi-depth Transformer-based Hierarchical (SMITH) 编码器来解决这个问题。 - -- 参考论文: - https://dl.acm.org/doi/abs/10.1145/3340531.3411908 - -- 参考实现: - https://github.com/google-research/google-research/tree/master/smith - -- 适配昇腾 AI 处理器的实现: - https://gitee.com/ascend/ModelZoo-TensorFlow/tree/master/TensorFlow/contrib/nlp/smith_ID2025_for_TensorFlow - - -## 默认配置 - -- 训练超参(单卡): - - Learning rate(LR): 5e-05 - - Batch size: 32 - - num_train_steps: 10000 - - num_warmup_steps: 1000 - -## 混合精度训练 -昇腾910 AI处理器提供自动混合精度功能,可以针对全网中float32数据类型的算子,按照内置的优化策略,自动将部分float32的算子降低精度到float16,从而在精度损失很小的情况下提升系统性能并减少内存使用。 - -## 开启混合精度 -``` -session_config = tf.ConfigProto(allow_soft_placement=True) -run_config = NPURunConfig( - session_config=session_config, - model_dir=FLAGS.output_dir, - save_checkpoints_steps=exp_config.train_eval_config.save_checkpoints_steps, - iterations_per_loop=exp_config.train_eval_config.iterations_per_loop, - precision_mode='allow_mix_precision', - hcom_parallel=True -) -``` - -
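For orientation, the snippet below is a minimal, self-contained sketch of how an `NPURunConfig` like the one above is typically handed to the NPU estimator wrapper for training; the toy `model_fn`, `input_fn`, output directory and import paths are illustrative assumptions, not code taken from this repository's `run_smith.py`.

```
# Minimal sketch (not this repository's run_smith.py): feeding an
# allow_mix_precision NPURunConfig into an NPUEstimator.
import tensorflow.compat.v1 as tf
from npu_bridge.estimator.npu.npu_config import NPURunConfig
from npu_bridge.estimator.npu.npu_estimator import NPUEstimator


def model_fn(features, labels, mode, params):
    # Toy regression head; a real model_fn would build the SMITH graph here.
    logits = tf.layers.dense(features["x"], 1)
    loss = tf.reduce_mean(tf.square(logits - labels))
    train_op = tf.train.GradientDescentOptimizer(1e-3).minimize(
        loss, global_step=tf.train.get_or_create_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)


def input_fn():
    # Random placeholder data standing in for the preprocessed tfrecord input.
    x = tf.random.uniform([32, 8])
    y = tf.random.uniform([32, 1])
    return tf.data.Dataset.from_tensors(({"x": x}, y)).repeat()


session_config = tf.ConfigProto(allow_soft_placement=True)
run_config = NPURunConfig(
    session_config=session_config,
    model_dir="./output",                   # placeholder output directory
    save_checkpoints_steps=1000,
    iterations_per_loop=100,
    precision_mode="allow_mix_precision",   # automatic float32 -> float16 mixing
    hcom_parallel=True)

estimator = NPUEstimator(model_fn=model_fn, config=run_config)
estimator.train(input_fn=input_fn, max_steps=100)
```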

-<h2 id="训练环境准备.md">训练环境准备</h2>

- -- 硬件环境和运行环境准备请参见《[CANN软件安装指南](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373?category=installation-update)》 -- 运行以下命令安装依赖。 - ``` - pip3 install requirements.txt - ``` -说明:1.依赖配置文件requirements.txt文件位于模型的根目录 - 2.根据原始的[git地址](https://github.com/google-research/google-research/tree/master/smith)下载nltk的数据 - ``` - import nltk - nltk.download('punkt') - ``` - 3.根据原始git地址进行各模块的流程测试 - ``` - python -m smith.loss_fns_test - python -m smith.metric_fns_test - python -m smith.modeling_test - python -m smith.preprocessing_smith_test - ``` -备;硬件环境准备请参见各硬件产品文档"[驱动和固件安装升级指南]( https://support.huawei.com/enterprise/zh/category/ai-computing-platform-pid-1557196528909)"。需要在硬件设备上安装与CANN版本配套的固件与驱动。 - -

-<h2 id="快速上手.md">快速上手</h2>

-## 数据集准备 -1. 模型使用数据集gwikimatch[介绍](https://github.com/google-research/google-research/tree/master/gwikimatch),数据路径[下载](http://storage.googleapis.com/gresearch/smith_gwikimatch/README.md)。请用户自行准备好数据集,数据集实际可用的为样例tfrecord(本次迁移所使用) -2. 模型使用分为2种方式,即:SMITH-Short方式下使用bert官方预训练模型uncased_L-12_H-768_A-12以及SMITH-WP+SP方式下使用作者预训练smith_pretrain_model_ckpts。下方"训练部分"提供路径下载。 -3. 使用protobuf工具将原作者提供的wiki_doc_pair.proto及experiment_config.proto转成wiki_doc_pair_pb2.py和experiment_config_pb2.py(已完成,可直接使用。具体过程见原GitHub的README.md) -4. 数据集转换说明,正式训练前需使用smith/preprocessing_smith.py将原始的训练集(small_demo_data.external_wdp.filtered_contro_wiki_cc_team.tfrecord)做预处理。然后作为模型训练的输入。 - 执行脚本 preprocessing_smith.sh(需配置`DATA_PATH`执行路径参数) - ``` - source ~/env.sh - DATA_PATH="../data" - DATA_PATH_OUT="../data/output_file" - if [ ! -d "${DATA_PATH_OUT}" ]; then - mkdir ${DATA_PATH_OUT} - fi - python3 preprocessing_smith.py --input_file=${DATA_PATH}/input_file/small_demo_data.external_wdp.filtered_contro_wiki_cc_team.tfrecord --output_file=${DATA_PATH}/output_file/smith_train_sample_input.tfrecord --vocab_file=${DATA_PATH}/uncased_L-12_H-768_A-12/vocab.txt - ``` - -## 模型训练 - -- 训练部分 - 1. 启动训练之前,首先要配置程序运行相关环境变量。 - 环境变量配置信息如下: - ``` - source ~/env.sh - export JOB_ID=10001 - export ASCEND_DEVICE_ID= 1 # 根据实际npu-smi info查看到的npu使用情况定 - ``` - - 2. 配置smith/config/dual_encoder_config.smith_wsp.32.48.pbtxt中的网络超参数,主要指定预训练模型和数据输入的路径(上述3的输出)。 - ``` - init_checkpoint # 预训练模型的ckpt路径 - bert_config_file # 配置config/sent_bert_4l_config.json - doc_bert_config_file # 配置config/doc_bert_3l_256h_config.json - vocab_file # 配置bert官方uncased_L-12_H-768_A-12/vocab.txt - input_file_for_train # 配置数据预处理的output_file - input_file_for_eval # 配置数据预处理的output_file (该代码由于原始数据无法获取,训练和验证使用同一套作者所提供样例tfrecord数据集) - ``` - 下载依赖模型文件: - bert预训练模型[下载](https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip) - 作者提供wsp训练依赖预训练模型[smith_wsp_pretrain_ckpt_opensource](http://storage.googleapis.com/gresearch/smith_gwikimatch/smith_wsp_pretrain_ckpt_opensource.zip) - - 3. 单卡训练 - 单卡训练指令如下(smith/bash/train_eval_predict_export/train_wsp.sh) - 注意配置eval_wsp.sh中的参数,'DATA_PATH', 'CODE_PATH' 设置为自己的路径(下同); - ``` - cd smith/bash - bash train_wsp.sh - ``` - 训练完成后ckpt文件[路径](https://e-share.obs-website.cn-north-1.myhuaweicloud.com?token=OVWrEUKDR1ADCmJjkBGQ3htGaizfvxlSf9IlnYwV2/viY4ioin+PR2KRtwadLKxE1gt4UUhppUs5f/woysQbGg9EGEcwk+17FznTTlVaDwMq/+IOPf44FDjQSSDcfB80gaA9iw1wn0I54iM5Ay3J2PDUDpr9bDe3faMdnnFv05ROdjODdyWRuoJPBK4YRNwwGAEJ7qhjWbF0IbVkmzXKPMkfdE9/KdHhSrOkGHtKcBFr7Ng1T/37noZ6kNMj8dGYofVLwMcdR51fm6hmlCbnC9jA0y9xyKD6TJPno0O+WFMpYos4IbhHPec1EWau0MY0+iMU2HTQTURNnMIfp28oR+TH2uM3RTV7kXNZMFijcKtX7Nxn6yVMOx4Fo0ycWImWRbBQUIKNLGCeD2XcMB++5tYV6y8LdzBOyQEGC/i1iYuQ/K+r1/IYDkdy59FZcZ/C/LV8tYe1u+I5F4eWe2tuguhK1qGRmBusr/StF7hwnl0xSbOY5hv3mkUczWH8bRA9zSlQ5C4ZzUExK2lok0qldw==) - 密码:111111 - -- 验证和推理部分 - 1. 执行eval_wsp.sh,验证最后的打屏显示出最终的accuracy/precision/recall指标 - ``` - cd smith/bash/train_eval_predict_export - bash eval_wsp.sh - ``` - 精度对比说明:通过比对GPU复现输出精度与NPU对比一致 - accuracy = 1.0 precision = 1.0 recall = 1.0 - - 2. 执行predict_wsp.sh,验证输出的prediction_results.json - ``` - cd smith/bash/train_eval_predict_export - bash predict_wsp.sh - ``` - 备:在预测的结果prediction_results.json 在--output_dir的路径下 - - 3. 执行export_wsp.sh - ``` - cd smith/bash/train_eval_predict_export - bash export_wsp.sh - ``` - -
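To sanity-check the predict step, here is a minimal sketch that reads `prediction_results.json` from the `--output_dir` line by line; it assumes the JSON-lines layout with string-valued `predicted_score`/`predicted_class` fields shown in the ACL README's sample output, and the file path is a placeholder.

```
# Minimal sketch: summarize prediction_results.json produced by the predict
# schedule. Assumes one JSON object per line with string-valued
# "predicted_score" and "predicted_class" fields, as in the sample output.
import json
from collections import Counter


def summarize(path="prediction_results.json"):
    scores, classes = [], Counter()
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            scores.append(float(record["predicted_score"]))
            classes[record["predicted_class"]] += 1
    print("pairs scored:", len(scores))
    print("mean predicted_score:", sum(scores) / max(len(scores), 1))
    print("class counts:", dict(classes))


if __name__ == "__main__":
    summarize()
```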

-<h2 id="高级参考.md">高级参考</h2>

-## 脚本和示例代码 - -``` - -└─smith - ├─README.md - ├─config - ├─bert_config.json - └─doc_bert_3l_256h_config.json... - └─doc_bert_3l_768h_config.json - └─dual_encoder_config.smith_short.32.8.pbtxt - └─dual_encoder_config.smith_wsp.32.48.pbtxt - ... - ├─bert - |─modeling.py - └─optimization.py - ├─input_fns.py - ├─layers.py - ... - ├─run_smith.py - └─modeling.py -``` - -## 脚本参数 - -``` -#Training ---dual_encoder_config_file config for train and eval. ---output_dir the output for ckpt. ---train_mode finetune. ---num_train_steps the step of train. ---num_warmup_steps the Patience factor of lr. ---schedule train/eval/predict/export. -``` \ No newline at end of file diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/__init__.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/__init__.py deleted file mode 100644 index c4cbefc3397c8c691234e616369bda8b71f721a6..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bash/preprocess/preprocessing_smith.sh b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bash/preprocess/preprocessing_smith.sh deleted file mode 100644 index 6d9af3cfef37f99ac1804d6a4e83188c1c9e3872..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bash/preprocess/preprocessing_smith.sh +++ /dev/null @@ -1,8 +0,0 @@ -source ~/env.sh - -DATA_PATH_OUT="../data/output_file" -if [ ! 
-d "${DATA_PATH_OUT}" ]; then - mkdir ${DATA_PATH_OUT} -fi - -python3 preprocessing_smith.py --input_file=${DATA_PATH}/input_file/small_demo_data.external_wdp.filtered_contro_wiki_cc_team.tfrecord --output_file=${DATA_PATH}/output_file/smith_train_sample_input.tfrecord --vocab_file=${DATA_PATH}/uncased_L-12_H-768_A-12/vocab.txt \ No newline at end of file diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bash/protobuf/protobuf.sh b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bash/protobuf/protobuf.sh deleted file mode 100644 index a6ad92886eb1ec743e0b3fe3f569fdf5f93eba7e..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bash/protobuf/protobuf.sh +++ /dev/null @@ -1,4 +0,0 @@ -PB_REL="https://github.com/protocolbuffers/protobuf/releases" -curl -LO \$PB_REL/download/v3.13.0/protoc-3.13.0-linux-x86_64.zip -unzip protoc-3.13.0-linux-x86_64.zip -d \$HOME/.local -export PATH="\$PATH:\$HOME/.local/bin" diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bash/train_eval_predict_export/eval_wsp.sh b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bash/train_eval_predict_export/eval_wsp.sh deleted file mode 100644 index fa88592b3d464874fb8643e721443ef00c1c9721..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bash/train_eval_predict_export/eval_wsp.sh +++ /dev/null @@ -1,3 +0,0 @@ -CODE_PATH="./smith" -DATA_PATH="../data" -nohup python3 -m smith.run_smith --dual_encoder_config_file=${CODE_PATH}/config/dual_encoder_config.smith_wsp.32.48.pbtxt --output_dir=${DATA_PATH}/result_file/train_wsp_20220904/ --train_mode=finetune --num_train_steps=10000 --num_warmup_steps=1000 --schedule=continuous_eval >> eval_0904.log 2>&1 & \ No newline at end of file diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bash/train_eval_predict_export/export_wsp.sh b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bash/train_eval_predict_export/export_wsp.sh deleted file mode 100644 index 6bbbd621c496024a71b635a8ff9c9480707b292b..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bash/train_eval_predict_export/export_wsp.sh +++ /dev/null @@ -1,3 +0,0 @@ -CODE_PATH="./smith" -DATA_PATH="../data" -nohup python3 -m smith.run_smith --dual_encoder_config_file=${CODE_PATH}/config/dual_encoder_config.smith_wsp.32.48.pbtxt --output_dir=${DATA_PATH}/result_file/train_wsp_20220904/ --train_mode=finetune --num_train_steps=10000 --num_warmup_steps=1000 --schedule=export >> export_0904.log 2>&1 & \ No newline at end of file diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bash/train_eval_predict_export/predict_wsp.sh b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bash/train_eval_predict_export/predict_wsp.sh deleted file mode 100644 index ad5048df003d3aaafd317e90fb5dc05517a6914c..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bash/train_eval_predict_export/predict_wsp.sh +++ /dev/null @@ -1,3 +0,0 @@ -CODE_PATH="./smith" -DATA_PATH="../data" -nohup python3 -m smith.run_smith --dual_encoder_config_file=${CODE_PATH}/config/dual_encoder_config.smith_wsp.32.48.pbtxt --output_dir=${DATA_PATH}/result_file/train_wsp_20220904/ --train_mode=finetune --num_train_steps=10000 --num_warmup_steps=1000 --schedule=predict >> predict_0904.log 2>&1 & \ No newline at end of file diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bash/train_eval_predict_export/train_wsp.sh 
b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bash/train_eval_predict_export/train_wsp.sh deleted file mode 100644 index 8ce2b5df00aa3548e84e5705b6276474c886a2f1..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bash/train_eval_predict_export/train_wsp.sh +++ /dev/null @@ -1,3 +0,0 @@ -CODE_PATH="./smith" -DATA_PATH="../data" -nohup python3 -m smith.run_smith --dual_encoder_config_file=${CODE_PATH}/config/dual_encoder_config.smith_wsp.32.48.pbtxt --output_dir=${DATA_PATH}/result_file/train_wsp_20220904/ --train_mode=finetune --num_train_steps=10000 --num_warmup_steps=1000 --schedule=train >> train_0904.log 2>&1 & \ No newline at end of file diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/modeling.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/modeling.py deleted file mode 100644 index 1ca7d412483106b44e12b0303a99aa11112c0e99..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/modeling.py +++ /dev/null @@ -1,1286 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""The main BERT model and related functions.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from npu_bridge.npu_init import * - -import collections -import copy -import json -import math -import re -import numpy as np -import six -from six.moves import range -import tensorflow.compat.v1 as tf -import tf_slim as slim - - -class BertConfig(object): - """Configuration for `BertModel`.""" - - def __init__(self, - vocab_size, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - initializer_range=0.02): - """Constructs BertConfig. - - Args: - vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. - hidden_size: Size of the encoder layers and the pooler layer. - num_hidden_layers: Number of hidden layers in the Transformer encoder. - num_attention_heads: Number of attention heads for each attention layer in - the Transformer encoder. - intermediate_size: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - hidden_act: The non-linear activation function (function or string) in the - encoder and pooler. - hidden_dropout_prob: The dropout probability for all fully connected - layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob: The dropout ratio for the attention - probabilities. - max_position_embeddings: The maximum sequence length that this model might - ever be used with. Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). - type_vocab_size: The vocabulary size of the `token_type_ids` passed into - `BertModel`. 
- initializer_range: The stdev of the truncated_normal_initializer for - initializing all weight matrices. - """ - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - - @classmethod - def from_dict(cls, json_object): - """Constructs a `BertConfig` from a Python dictionary of parameters.""" - config = BertConfig(vocab_size=None) - for (key, value) in six.iteritems(json_object): - config.__dict__[key] = value - return config - - @classmethod - def from_json_file(cls, json_file): - """Constructs a `BertConfig` from a json file of parameters.""" - with tf.gfile.GFile(json_file, "r") as reader: - text = reader.read() - return cls.from_dict(json.loads(text)) - - def to_dict(self): - """Serializes this instance to a Python dictionary.""" - output = copy.deepcopy(self.__dict__) - return output - - def to_json_string(self): - """Serializes this instance to a JSON string.""" - return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" - - -class DocBertModel(object): - """DocBERT model ("The Doc Level BERT Encoder"). - - This is the adapted version to encode a long form document based on a - sequence of sentence or passage representations. - - Example usage: - - ```python - # Already learned the representations for each sentence or passage - # The shape of input_reps: [batch, doc_len_by_sentence, hidden] - # The shape of input_mask: [batch, doc_len_by_sentence] - # 0 for paded positions. 1 for real positions. - input_reps = tf.constant([[[0.1,0.2], [0.1,0.2], [0.1,0.2]], [[0.2,0.3], - [0.2,0.3], [0.2,0.3]]]) - input_mask = tf.constant([[1, 1, 0], [1, 1, 0]]) - # token_type_ids is optional. - - config = modeling.BertConfig(hidden_size=512, - num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) - - model = modeling.DocBertModel(config=config, is_training=True, - input_reps=input_reps, input_mask=input_mask) - - label_embeddings = tf.get_variable(...) - pooled_output = model.get_pooled_output() - logits = tf.matmul(pooled_output, label_embeddings) - ... - ``` - """ - - def __init__(self, - config, - is_training, - input_reps, - input_mask=None, - token_type_ids=None, - scope=None): - """Constructor for DocBertModel. - - Args: - config: `BertConfig` instance. - is_training: bool. true for training model, false for eval model. Controls - whether dropout will be applied. - input_reps: float32 Tensor of shape [batch_size, seq_length, hidden]. - input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. - token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. - scope: (optional) variable scope. Defaults to "doc_bert". - - Raises: - ValueError: The config is invalid or one of the input tensor shapes - is invalid. 
- """ - config = copy.deepcopy(config) - if not is_training: - config.hidden_dropout_prob = 0.0 - config.attention_probs_dropout_prob = 0.0 - - input_shape = get_shape_list(input_reps, expected_rank=3) - batch_size = input_shape[0] - seq_length = input_shape[1] - - if input_mask is None: - input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) - - if token_type_ids is None: - token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) - - with tf.variable_scope("doc_bert", scope, reuse=tf.AUTO_REUSE): - with tf.variable_scope("doc_embeddings"): - - # In the context of doc_bert, the word embedding can mean the embedding - # for a sentence or a passage in a document. - self.sentence_embedding_output = input_reps - - # Add positional embeddings and token type embeddings, then layer - # normalize and perform dropout. - self.embedding_output = embedding_postprocessor( - input_tensor=self.sentence_embedding_output, - use_token_type=False, - use_position_embeddings=True, - position_embedding_name="position_embeddings", - initializer_range=config.initializer_range, - max_position_embeddings=config.max_position_embeddings, - dropout_prob=config.hidden_dropout_prob) - - with tf.variable_scope("doc_encoder"): - # This converts a 2D mask of shape [batch_size, seq_length] to a 3D - # mask of shape [batch_size, seq_length, seq_length] which is used - # for the attention scores. - # Get batch_size and from_seq_length from input_reps. - attention_mask = create_attention_mask_from_input_mask_doc( - batch_size, seq_length, input_mask) - - # Run the stacked transformer. - # `sequence_output` shape = [batch_size, seq_length, hidden_size]. - self.all_encoder_layers = transformer_model( - input_tensor=self.embedding_output, - attention_mask=attention_mask, - hidden_size=config.hidden_size, - num_hidden_layers=config.num_hidden_layers, - num_attention_heads=config.num_attention_heads, - intermediate_size=config.intermediate_size, - intermediate_act_fn=get_activation(config.hidden_act), - hidden_dropout_prob=config.hidden_dropout_prob, - attention_probs_dropout_prob=config.attention_probs_dropout_prob, - initializer_range=config.initializer_range, - do_return_all_layers=True) - - self.sequence_output = self.all_encoder_layers[-1] - # The "pooler" converts the encoded sequence tensor of shape - # [batch_size, seq_length, hidden_size] to a tensor of shape - # [batch_size, hidden_size]. This is necessary for segment-level - # (or segment-pair-level) classification tasks where we need a fixed - # dimensional representation of the segment. - with tf.variable_scope("doc_pooler"): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. We assume that this has been pre-trained. - first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) - self.pooled_output = tf.layers.dense( - first_token_tensor, - config.hidden_size, - activation=tf.tanh, - kernel_initializer=create_initializer(config.initializer_range)) - - def get_pooled_output(self): - return self.pooled_output - - def get_sequence_output(self): - """Gets final hidden layer of encoder. - - Returns: - float Tensor of shape [batch_size, seq_length, hidden_size] corresponding - to the final hidden of the transformer encoder. - """ - return self.sequence_output - - def get_all_encoder_layers(self): - return self.all_encoder_layers - - def get_sentence_embedding_output(self): - """Get the sentence level embedding representations. 
- - This is BEFORE positional embeddings and token type embeddings have been - added. - - Returns: - float Tensor of shape [batch_size, seq_length, hidden_size] - """ - return self.sentence_embedding_output - - def get_embedding_output(self): - """Gets output of the embedding lookup (i.e., input to the transformer). - - Returns: - float Tensor of shape [batch_size, seq_length, hidden_size] corresponding - to the output of the embedding layer, after summing the word - embeddings with the positional embeddings and the token type embeddings, - then performing layer normalization. This is the input to the transformer. - """ - return self.embedding_output - - def get_embedding_table(self): - return self.embedding_table - - -class BertModel(object): - """BERT model ("Bidirectional Encoder Representations from Transformers"). - - Example usage: - - ```python - # Already been converted into WordPiece token ids - input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) - input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) - token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) - - config = modeling.BertConfig(vocab_size=32000, hidden_size=512, - num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) - - model = modeling.BertModel(config=config, is_training=True, - input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) - - label_embeddings = tf.get_variable(...) - pooled_output = model.get_pooled_output() - logits = tf.matmul(pooled_output, label_embeddings) - ... - ``` - """ - - def __init__(self, - config, - is_training, - input_ids, - input_mask=None, - token_type_ids=None, - use_one_hot_embeddings=False, - sent_bert_trainable=True, - scope=None): - """Constructor for BertModel. - - Args: - config: `BertConfig` instance. - is_training: bool. true for training model, false for eval model. Controls - whether dropout will be applied. - input_ids: int32 Tensor of shape [batch_size, seq_length]. - input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. - token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. - use_one_hot_embeddings: (optional) bool. Whether to use one-hot word - embeddings or tf.embedding_lookup() for the word embeddings. - sent_bert_trainable: (optional) bool. Whether to update the parameters of - the bert model. - scope: (optional) variable scope. Defaults to "bert". - - Raises: - ValueError: The config is invalid or one of the input tensor shapes - is invalid. - """ - config = copy.deepcopy(config) - if not is_training: - config.hidden_dropout_prob = 0.0 - config.attention_probs_dropout_prob = 0.0 - - input_shape = get_shape_list(input_ids, expected_rank=2) - batch_size = input_shape[0] - seq_length = input_shape[1] - - if input_mask is None: - input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) - - if token_type_ids is None: - token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) - - with tf.variable_scope("bert", scope, reuse=tf.AUTO_REUSE): - with tf.variable_scope("embeddings"): - # Perform embedding lookup on the word ids. - (self.word_embedding_output, self.embedding_table) = embedding_lookup( - input_ids=input_ids, - vocab_size=config.vocab_size, - embedding_size=config.hidden_size, - initializer_range=config.initializer_range, - word_embedding_name="word_embeddings", - use_one_hot_embeddings=use_one_hot_embeddings, - trainable=sent_bert_trainable) - - # Add positional embeddings and token type embeddings, then layer - # normalize and perform dropout. 
- self.embedding_output = embedding_postprocessor( - input_tensor=self.word_embedding_output, - use_token_type=True, - token_type_ids=token_type_ids, - token_type_vocab_size=config.type_vocab_size, - token_type_embedding_name="token_type_embeddings", - use_position_embeddings=True, - position_embedding_name="position_embeddings", - initializer_range=config.initializer_range, - max_position_embeddings=config.max_position_embeddings, - dropout_prob=config.hidden_dropout_prob, - trainable=sent_bert_trainable) - - with tf.variable_scope("encoder"): - # This converts a 2D mask of shape [batch_size, seq_length] to a 3D - # mask of shape [batch_size, seq_length, seq_length] which is used - # for the attention scores. - attention_mask = create_attention_mask_from_input_mask( - input_ids, input_mask) - - # Run the stacked transformer. - # `sequence_output` shape = [batch_size, seq_length, hidden_size]. - self.all_encoder_layers = transformer_model( - input_tensor=self.embedding_output, - attention_mask=attention_mask, - hidden_size=config.hidden_size, - num_hidden_layers=config.num_hidden_layers, - num_attention_heads=config.num_attention_heads, - intermediate_size=config.intermediate_size, - intermediate_act_fn=get_activation(config.hidden_act), - hidden_dropout_prob=config.hidden_dropout_prob, - attention_probs_dropout_prob=config.attention_probs_dropout_prob, - initializer_range=config.initializer_range, - do_return_all_layers=True, - trainable=sent_bert_trainable) - - self.sequence_output = self.all_encoder_layers[-1] - # The "pooler" converts the encoded sequence tensor of shape - # [batch_size, seq_length, hidden_size] to a tensor of shape - # [batch_size, hidden_size]. This is necessary for segment-level - # (or segment-pair-level) classification tasks where we need a fixed - # dimensional representation of the segment. - with tf.variable_scope("pooler"): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. We assume that this has been pre-trained - first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) - self.pooled_output = tf.layers.dense( - first_token_tensor, - config.hidden_size, - activation=tf.tanh, - kernel_initializer=create_initializer(config.initializer_range), - trainable=sent_bert_trainable) - - def get_pooled_output(self): - return self.pooled_output - - def get_sequence_output(self): - """Gets final hidden layer of encoder. - - Returns: - float Tensor of shape [batch_size, seq_length, hidden_size] corresponding - to the final hidden of the transformer encoder. - """ - return self.sequence_output - - def get_all_encoder_layers(self): - return self.all_encoder_layers - - def get_word_embedding_output(self): - """Get output of the word(piece) embedding lookup. - - This is BEFORE positional embeddings and token type embeddings have been - added. - - Returns: - float Tensor of shape [batch_size, seq_length, hidden_size] corresponding - to the output of the word(piece) embedding layer. - """ - return self.word_embedding_output - - def get_embedding_output(self): - """Gets output of the embedding lookup (i.e., input to the transformer). - - Returns: - float Tensor of shape [batch_size, seq_length, hidden_size] corresponding - to the output of the embedding layer, after summing the word - embeddings with the positional embeddings and the token type embeddings, - then performing layer normalization. This is the input to the transformer. 
- """ - return self.embedding_output - - def get_embedding_table(self): - return self.embedding_table - - -def gelu(x): - """Gaussian Error Linear Unit. - - This is a smoother version of the RELU. - Original paper: https://arxiv.org/abs/1606.08415 - Args: - x: float Tensor to perform activation. - - Returns: - `x` with the GELU activation applied. - """ - cdf = 0.5 * (1.0 + tf.tanh( - (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) - return x * cdf - - -def get_activation(activation_string): - """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. - - Args: - activation_string: String name of the activation function. - - Returns: - A Python function corresponding to the activation function. If - `activation_string` is None, empty, or "linear", this will return None. - If `activation_string` is not a string, it will return `activation_string`. - - Raises: - ValueError: The `activation_string` does not correspond to a known - activation. - """ - - # We assume that anything that"s not a string is already an activation - # function, so we just return it. - if not isinstance(activation_string, six.string_types): - return activation_string - - if not activation_string: - return None - - act = activation_string.lower() - if act == "linear": - return None - elif act == "relu": - return tf.nn.relu - elif act == "gelu": - return gelu - elif act == "tanh": - return tf.tanh - else: - raise ValueError("Unsupported activation: %s" % act) - - -def get_assignment_map_from_checkpoint(tvars, init_checkpoint): - """Compute the union of the current variables and checkpoint variables.""" - assignment_map = {} - initialized_variable_names = {} - - name_to_variable = collections.OrderedDict() - for var in tvars: - name = var.name - m = re.match("^(.*):\\d+$", six.ensure_str(name)) - if m is not None: - name = m.group(1) - name_to_variable[name] = var - - init_vars = tf.train.list_variables(init_checkpoint) - - assignment_map = collections.OrderedDict() - for x in init_vars: - (name, var) = (x[0], x[1]) - if name not in name_to_variable: - continue - assignment_map[name] = name - initialized_variable_names[name] = 1 - initialized_variable_names[name + ":0"] = 1 - - return (assignment_map, initialized_variable_names) - - -def dropout(input_tensor, dropout_prob): - """Perform dropout. - - Args: - input_tensor: float Tensor. - dropout_prob: Python float. The probability of dropping out a value (NOT of - *keeping* a dimension as in `tf.nn.dropout`). - - Returns: - A version of `input_tensor` with dropout applied. 
- """ - if dropout_prob is None or dropout_prob == 0.0: - return input_tensor - - output = tf.nn.dropout(input_tensor, rate=dropout_prob) - return output - - -def layer_norm(input_tensor, name=None): - """Run layer normalization on the last dimension of the tensor.""" - return slim.layers.layer_norm( - inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) - - -def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): - """Runs layer normalization followed by dropout.""" - output_tensor = layer_norm(input_tensor, name) - output_tensor = dropout(output_tensor, dropout_prob) - return output_tensor - - -def create_initializer(initializer_range=0.02): - """Creates a `truncated_normal_initializer` with the given range.""" - return tf.truncated_normal_initializer(stddev=initializer_range) - - -def embedding_lookup(input_ids, - vocab_size, - embedding_size=128, - initializer_range=0.02, - word_embedding_name="word_embeddings", - use_one_hot_embeddings=False, - trainable=True): - """Looks up words embeddings for id tensor. - - Args: - input_ids: int32 Tensor of shape [batch_size, seq_length] containing word - ids. - vocab_size: int. Size of the embedding vocabulary. - embedding_size: int. Width of the word embeddings. - initializer_range: float. Embedding initialization range. - word_embedding_name: string. Name of the embedding table. - use_one_hot_embeddings: bool. If True, use one-hot method for word - embeddings. If False, use `tf.nn.embedding_lookup()`. - trainable: bool. If True, the word embeddings are trainable. Otherwise they - will be fixed. This can be useful if we want to fix the embedding - parameters of a pretrained BERT model. - - Returns: - float Tensor of shape [batch_size, seq_length, embedding_size]. - """ - # This function assumes that the input is of shape [batch_size, seq_length, - # num_inputs]. - # - # If the input is a 2D tensor of shape [batch_size, seq_length], we - # reshape to [batch_size, seq_length, 1]. - if input_ids.shape.ndims == 2: - input_ids = tf.expand_dims(input_ids, axis=[-1]) - - embedding_table = tf.get_variable( - name=word_embedding_name, - shape=[vocab_size, embedding_size], - initializer=create_initializer(initializer_range), - trainable=trainable) - - if use_one_hot_embeddings: - flat_input_ids = tf.reshape(input_ids, [-1]) - one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) - output = tf.matmul(one_hot_input_ids, embedding_table) - else: - output = tf.nn.embedding_lookup(embedding_table, input_ids) - - input_shape = get_shape_list(input_ids) - - output = tf.reshape(output, - input_shape[0:-1] + [input_shape[-1] * embedding_size]) - return (output, embedding_table) - - -def embedding_postprocessor(input_tensor, - use_token_type=False, - token_type_ids=None, - token_type_vocab_size=16, - token_type_embedding_name="token_type_embeddings", - use_position_embeddings=True, - position_embedding_name="position_embeddings", - initializer_range=0.02, - max_position_embeddings=512, - dropout_prob=0.1, - trainable=True): - """Performs various post-processing on a word embedding tensor. - - Args: - input_tensor: float Tensor of shape [batch_size, seq_length, - embedding_size]. - use_token_type: bool. Whether to add embeddings for `token_type_ids`. - token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. - Must be specified if `use_token_type` is True. - token_type_vocab_size: int. The vocabulary size of `token_type_ids`. - token_type_embedding_name: string. 
The name of the embedding table variable - for token type ids. - use_position_embeddings: bool. Whether to add position embeddings for the - position of each token in the sequence. - position_embedding_name: string. The name of the embedding table variable - for positional embeddings. - initializer_range: float. Range of the weight initialization. - max_position_embeddings: int. Maximum sequence length that might ever be - used with this model. This can be longer than the sequence length of - input_tensor, but cannot be shorter. - dropout_prob: float. Dropout probability applied to the final output tensor. - trainable: bool. Whether the train parameters in the BERT model. - - Returns: - float tensor with same shape as `input_tensor`. - - Raises: - ValueError: One of the tensor shapes or input values is invalid. - """ - input_shape = get_shape_list(input_tensor, expected_rank=3) - batch_size = input_shape[0] - seq_length = input_shape[1] - width = input_shape[2] - - output = input_tensor - - if use_token_type: - if token_type_ids is None: - raise ValueError("`token_type_ids` must be specified if" - "`use_token_type` is True.") - token_type_table = tf.get_variable( - name=token_type_embedding_name, - shape=[token_type_vocab_size, width], - initializer=create_initializer(initializer_range), - trainable=trainable) - # This vocab will be small so we always do one-hot here, since it is always - # faster for a small vocabulary. - flat_token_type_ids = tf.reshape(token_type_ids, [-1]) - one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) - token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) - token_type_embeddings = tf.reshape(token_type_embeddings, - [batch_size, seq_length, width]) - output += token_type_embeddings - - if use_position_embeddings: - assert_op = tf.assert_less_equal(seq_length, max_position_embeddings) - with tf.control_dependencies([assert_op]): - full_position_embeddings = tf.get_variable( - name=position_embedding_name, - shape=[max_position_embeddings, width], - initializer=create_initializer(initializer_range), - trainable=trainable) - # Since the position embedding table is a learned variable, we create it - # using a (long) sequence length `max_position_embeddings`. The actual - # sequence length might be shorter than this, for faster training of - # tasks that do not have long sequences. - # - # So `full_position_embeddings` is effectively an embedding table - # for position [0, 1, 2, ..., max_position_embeddings-1], and the current - # sequence has positions [0, 1, 2, ... seq_length-1], so we can just - # perform a slice. - position_embeddings = tf.slice(full_position_embeddings, [0, 0], - [seq_length, -1]) - num_dims = len(output.shape.as_list()) - - # Only the last two dimensions are relevant (`seq_length` and `width`), so - # we broadcast among the first dimensions, which is typically just - # the batch size. 
- position_broadcast_shape = [] - for _ in range(num_dims - 2): - position_broadcast_shape.append(1) - position_broadcast_shape.extend([seq_length, width]) - position_embeddings = tf.reshape(position_embeddings, - position_broadcast_shape) - output += position_embeddings - - output = layer_norm_and_dropout(output, dropout_prob) - return output - - -def create_attention_mask_from_input_mask_doc(batch_size, from_seq_length, - to_mask): - """Create 3D attention mask from a 2D tensor mask for the doc_bert model.""" - to_shape = get_shape_list(to_mask, expected_rank=2) - to_seq_length = to_shape[1] - to_mask = tf.cast( - tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) - broadcast_ones = tf.ones( - shape=[batch_size, from_seq_length, 1], dtype=tf.float32) - # Here we broadcast along two dimensions to create the mask. - mask = broadcast_ones * to_mask - return mask - - -def create_attention_mask_from_input_mask(from_tensor, to_mask): - """Create 3D attention mask from a 2D tensor mask. - - Args: - from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. - to_mask: int32 Tensor of shape [batch_size, to_seq_length]. - - Returns: - float Tensor of shape [batch_size, from_seq_length, to_seq_length]. - """ - from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) - batch_size = from_shape[0] - from_seq_length = from_shape[1] - - to_shape = get_shape_list(to_mask, expected_rank=2) - to_seq_length = to_shape[1] - - to_mask = tf.cast( - tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) - - # We don't assume that `from_tensor` is a mask (although it could be). We - # don't actually care if we attend *from* padding tokens (only *to* padding) - # tokens so we create a tensor of all ones. - # - # `broadcast_ones` = [batch_size, from_seq_length, 1] - broadcast_ones = tf.ones( - shape=[batch_size, from_seq_length, 1], dtype=tf.float32) - - # Here we broadcast along two dimensions to create the mask. - mask = broadcast_ones * to_mask - - return mask - - -def dense_layer_3d(input_tensor, - num_attention_heads, - size_per_head, - initializer, - activation, - name=None, - trainable=True): - """A dense layer with 3D kernel. - - Args: - input_tensor: float Tensor of shape [batch, seq_length, hidden_size]. - num_attention_heads: Number of attention heads. - size_per_head: The size per attention head. - initializer: Kernel initializer. - activation: Actication function. - name: The name scope of this layer. - trainable: Whether the model parameters are trainable. - - Returns: - float logits Tensor. - """ - - last_dim = get_shape_list(input_tensor)[-1] - - with tf.variable_scope(name): - w = tf.get_variable( - name="kernel", - shape=[last_dim, num_attention_heads * size_per_head], - initializer=initializer, - trainable=trainable) - w = tf.reshape(w, [last_dim, num_attention_heads, size_per_head]) - b = tf.get_variable( - name="bias", - shape=[num_attention_heads * size_per_head], - initializer=tf.zeros_initializer, - trainable=trainable) - b = tf.reshape(b, [num_attention_heads, size_per_head]) - ret = tf.einsum("abc,cde->abde", input_tensor, w) - ret += b - if activation is not None: - return activation(ret) - else: - return ret - - -def dense_layer_3d_proj(input_tensor, - hidden_size, - num_attention_heads, - head_size, - initializer, - activation, - name=None, - trainable=True): - """A dense layer with 3D kernel for projection. - - Args: - input_tensor: float Tensor of shape [batch,from_seq_length, - num_attention_heads, size_per_head]. 
- hidden_size: The size of hidden layer. - num_attention_heads: The size of output dimension. - head_size: The size of head. - initializer: Kernel initializer. - activation: Actication function. - name: The name scope of this layer. - trainable: Whether update the parameters. - - Returns: - float logits Tensor. - """ - head_size = hidden_size // num_attention_heads - with tf.variable_scope(name): - w = tf.get_variable( - name="kernel", - shape=[hidden_size, hidden_size], - initializer=initializer, - trainable=trainable) - w = tf.reshape(w, [num_attention_heads, head_size, hidden_size]) - b = tf.get_variable( - name="bias", - shape=[hidden_size], - initializer=tf.zeros_initializer, - trainable=trainable) - - ret = tf.einsum("BFNH,NHD->BFD", input_tensor, w) - ret += b - if activation is not None: - return activation(ret) - else: - return ret - - -def dense_layer_2d(input_tensor, - output_size, - initializer, - activation, - name=None, - trainable=True): - """A dense layer with 2D kernel. - - Args: - input_tensor: Float tensor with rank 3. - output_size: The size of output dimension. - initializer: Kernel initializer. - activation: Actication function. - name: The name scope of this layer. - trainable: Whether update the parameters. - - Returns: - float logits Tensor. - """ - last_dim = get_shape_list(input_tensor)[-1] - with tf.variable_scope(name): - w = tf.get_variable( - name="kernel", - shape=[last_dim, output_size], - initializer=initializer, - trainable=trainable) - b = tf.get_variable( - name="bias", - shape=[output_size], - initializer=tf.zeros_initializer, - trainable=trainable) - - ret = tf.einsum("abc,cd->abd", input_tensor, w) - ret += b - if activation is not None: - return activation(ret) - else: - return ret - - -def attention_layer(from_tensor, - to_tensor, - attention_mask=None, - num_attention_heads=1, - size_per_head=512, - query_act=None, - key_act=None, - value_act=None, - attention_probs_dropout_prob=0.0, - initializer_range=0.02, - batch_size=None, - from_seq_length=None, - to_seq_length=None, - trainable=True): - """Performs multi-headed attention from `from_tensor` to `to_tensor`. - - This is an implementation of multi-headed attention based on "Attention - is all you Need". If `from_tensor` and `to_tensor` are the same, then - this is self-attention. Each timestep in `from_tensor` attends to the - corresponding sequence in `to_tensor`, and returns a fixed-with vector. - - This function first projects `from_tensor` into a "query" tensor and - `to_tensor` into "key" and "value" tensors. These are (effectively) a list - of tensors of length `num_attention_heads`, where each tensor is of shape - [batch_size, seq_length, size_per_head]. - - Then, the query and key tensors are dot-producted and scaled. These are - softmaxed to obtain attention probabilities. The value tensors are then - interpolated by these probabilities, then concatenated back to a single - tensor and returned. 
- - In practice, the multi-headed attention are done with tf.einsum as follows: - Input_tensor: [BFD] - Wq, Wk, Wv: [DNH] - Q:[BFNH] = einsum('BFD,DNH->BFNH', Input_tensor, Wq) - K:[BTNH] = einsum('BTD,DNH->BTNH', Input_tensor, Wk) - V:[BTNH] = einsum('BTD,DNH->BTNH', Input_tensor, Wv) - attention_scores:[BNFT] = einsum('BFNH,BTNH>BNFT', Q, K) / sqrt(H) - attention_probs:[BNFT] = softmax(attention_scores) - context_layer:[BFNH] = einsum('BNFT,BTNH->BFNH', attention_probs, V) - Wout:[DNH] - Output:[BFD] = einsum('BFNH,DNH>BFD', context_layer, Wout) - - Args: - from_tensor: float Tensor of shape [batch_size, from_seq_length, - from_width]. - to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. - attention_mask: (optional) int32 Tensor of shape [batch_size, - from_seq_length, to_seq_length]. The values should be 1 or 0. The - attention scores will effectively be set to -infinity for any positions in - the mask that are 0, and will be unchanged for positions that are 1. - num_attention_heads: int. Number of attention heads. - size_per_head: int. Size of each attention head. - query_act: (optional) Activation function for the query transform. - key_act: (optional) Activation function for the key transform. - value_act: (optional) Activation function for the value transform. - attention_probs_dropout_prob: (optional) float. Dropout probability of the - attention probabilities. - initializer_range: float. Range of the weight initializer. - batch_size: (Optional) int. If the input is 2D, this might be the batch size - of the 3D version of the `from_tensor` and `to_tensor`. - from_seq_length: (Optional) If the input is 2D, this might be the seq length - of the 3D version of the `from_tensor`. - to_seq_length: (Optional) If the input is 2D, this might be the seq length - of the 3D version of the `to_tensor`. - trainable: Whether the parameters are trainable. - - Returns: - float Tensor of shape [batch_size, from_seq_length, num_attention_heads, - size_per_head]. - - Raises: - ValueError: Any of the arguments or tensor shapes are invalid. 
- """ - from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) - to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) - - if len(from_shape) != len(to_shape): - raise ValueError( - "The rank of `from_tensor` must match the rank of `to_tensor`.") - - if len(from_shape) == 3: - batch_size = from_shape[0] - from_seq_length = from_shape[1] - to_seq_length = to_shape[1] - elif len(from_shape) == 2: - if (batch_size is None or from_seq_length is None or to_seq_length is None): - raise ValueError( - "When passing in rank 2 tensors to attention_layer, the values " - "for `batch_size`, `from_seq_length`, and `to_seq_length` " - "must all be specified.") - - # Scalar dimensions referenced here: - # B = batch size (number of sequences) - # F = `from_tensor` sequence length - # T = `to_tensor` sequence length - # N = `num_attention_heads` - # H = `size_per_head` - - # `query_layer` = [B, F, N, H] - query_layer = dense_layer_3d( - from_tensor, - num_attention_heads, - size_per_head, - create_initializer(initializer_range), - query_act, - "query", - trainable=trainable) - - # `key_layer` = [B, T, N, H] - key_layer = dense_layer_3d( - to_tensor, - num_attention_heads, - size_per_head, - create_initializer(initializer_range), - key_act, - "key", - trainable=trainable) - - # `value_layer` = [B, T, N, H] - value_layer = dense_layer_3d( - to_tensor, - num_attention_heads, - size_per_head, - create_initializer(initializer_range), - value_act, - "value", - trainable=trainable) - - # Take the dot product between "query" and "key" to get the raw - # attention scores. - attention_scores = tf.einsum("BTNH,BFNH->BNFT", key_layer, query_layer) - attention_scores = tf.multiply(attention_scores, - 1.0 / math.sqrt(float(size_per_head))) - - if attention_mask is not None: - # `attention_mask` = [B, 1, F, T] - attention_mask = tf.expand_dims(attention_mask, axis=[1]) - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and -10000.0 for masked positions. - adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 - - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - attention_scores += adder - - # Normalize the attention scores to probabilities. - # `attention_probs` = [B, N, F, T] - attention_probs = tf.nn.softmax(attention_scores) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = dropout(attention_probs, attention_probs_dropout_prob) - - # `context_layer` = [B, F, N, H] - context_layer = tf.einsum("BNFT,BTNH->BFNH", attention_probs, value_layer) - - return context_layer - - -def transformer_model(input_tensor, - attention_mask=None, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - intermediate_act_fn=gelu, - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - initializer_range=0.02, - do_return_all_layers=False, - trainable=True): - """Multi-headed, multi-layer Transformer from "Attention is All You Need". - - This is almost an exact implementation of the original Transformer encoder. 
- - See the original paper: - https://arxiv.org/abs/1706.03762 - - Also see: - https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py - - Args: - input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. - attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, - seq_length], with 1 for positions that can be attended to and 0 in - positions that should not be. - hidden_size: int. Hidden size of the Transformer. - num_hidden_layers: int. Number of layers (blocks) in the Transformer. - num_attention_heads: int. Number of attention heads in the Transformer. - intermediate_size: int. The size of the "intermediate" (a.k.a., feed - forward) layer. - intermediate_act_fn: function. The non-linear activation function to apply - to the output of the intermediate/feed-forward layer. - hidden_dropout_prob: float. Dropout probability for the hidden layers. - attention_probs_dropout_prob: float. Dropout probability of the attention - probabilities. - initializer_range: float. Range of the initializer (stddev of truncated - normal). - do_return_all_layers: Whether to also return all layers or just the final - layer. - trainable: Whether the model parameter can be updated. - - Returns: - float Tensor of shape [batch_size, seq_length, hidden_size], the final - hidden layer of the Transformer. - - Raises: - ValueError: A Tensor shape or parameter is invalid. - """ - if hidden_size % num_attention_heads != 0: - raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (hidden_size, num_attention_heads)) - - attention_head_size = int(hidden_size / num_attention_heads) - input_shape = get_shape_list(input_tensor, expected_rank=3) - input_width = input_shape[2] - - # The Transformer performs sum residuals on all layers so the input needs - # to be the same as the hidden size. - if input_width != hidden_size: - raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % - (input_width, hidden_size)) - - prev_output = input_tensor - all_layer_outputs = [] - for layer_idx in range(num_hidden_layers): - with tf.variable_scope("layer_%d" % layer_idx, reuse=tf.AUTO_REUSE): - layer_input = prev_output - - with tf.variable_scope("attention"): - with tf.variable_scope("self"): - attention_output = attention_layer( - from_tensor=layer_input, - to_tensor=layer_input, - attention_mask=attention_mask, - num_attention_heads=num_attention_heads, - size_per_head=attention_head_size, - attention_probs_dropout_prob=attention_probs_dropout_prob, - initializer_range=initializer_range, - trainable=trainable) - - # Run a linear projection of `hidden_size` then add a residual - # with `layer_input`. - with tf.variable_scope("output"): - attention_output = dense_layer_3d_proj( - attention_output, - hidden_size, - num_attention_heads, - attention_head_size, - create_initializer(initializer_range), - None, - "dense", - trainable=trainable) - attention_output = dropout(attention_output, hidden_dropout_prob) - # Implementation of residual connections. - attention_output = layer_norm( - input_tensor=attention_output + layer_input) - - # The activation is only applied to the "intermediate" hidden layer. - with tf.variable_scope("intermediate"): - intermediate_output = dense_layer_2d( - attention_output, - intermediate_size, - create_initializer(initializer_range), - intermediate_act_fn, - "dense", - trainable=trainable) - - # Down-project back to `hidden_size` then add the residual. 
- with tf.variable_scope("output"): - layer_output = dense_layer_2d( - intermediate_output, - hidden_size, - create_initializer(initializer_range), - None, - "dense", - trainable=trainable) - layer_output = dropout(layer_output, hidden_dropout_prob) - layer_output = layer_norm( - input_tensor=layer_output + attention_output) - prev_output = layer_output - all_layer_outputs.append(layer_output) - - if do_return_all_layers: - return all_layer_outputs - else: - return all_layer_outputs[-1] - - -def get_shape_list(tensor, expected_rank=None, name=None): - """Returns a list of the shape of tensor, preferring static dimensions. - - Args: - tensor: A tf.Tensor object to find the shape of. - expected_rank: (optional) int. The expected rank of `tensor`. If this is - specified and the `tensor` has a different rank, an exception will be - thrown. - name: Optional name of the tensor for the error message. - - Returns: - A list of dimensions of the shape of tensor. All static dimensions will - be returned as python integers, and dynamic dimensions will be returned - as tf.Tensor scalars. - """ - if name is None: - name = tensor.name - - if expected_rank is not None: - assert_rank(tensor, expected_rank, name) - - shape = tensor.shape.as_list() - - non_static_indexes = [] - for (index, dim) in enumerate(shape): - if dim is None: - non_static_indexes.append(index) - - if not non_static_indexes: - return shape - - dyn_shape = tf.shape(tensor) - for index in non_static_indexes: - shape[index] = dyn_shape[index] - return shape - - -def reshape_to_matrix(input_tensor): - """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" - ndims = input_tensor.shape.ndims - if ndims < 2: - raise ValueError("Input tensor must have at least rank 2. Shape = %s" % - (input_tensor.shape)) - if ndims == 2: - return input_tensor - - width = input_tensor.shape[-1] - output_tensor = tf.reshape(input_tensor, [-1, width]) - return output_tensor - - -def reshape_from_matrix(output_tensor, orig_shape_list): - """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" - if len(orig_shape_list) == 2: - return output_tensor - - output_shape = get_shape_list(output_tensor) - - orig_dims = orig_shape_list[0:-1] - width = output_shape[-1] - - return tf.reshape(output_tensor, orig_dims + [width]) - - -def assert_rank(tensor, expected_rank, name=None): - """Raises an exception if the tensor rank is not of the expected rank. - - Args: - tensor: A tf.Tensor to check the rank of. - expected_rank: Python integer or list of integers, expected rank. - name: Optional name of the tensor for the error message. - - Raises: - ValueError: If the expected shape doesn't match the actual shape. 
- """ - if name is None: - name = tensor.name - - expected_rank_dict = {} - if isinstance(expected_rank, six.integer_types): - expected_rank_dict[expected_rank] = True - else: - for x in expected_rank: - expected_rank_dict[x] = True - - actual_rank = tensor.shape.ndims - if actual_rank not in expected_rank_dict: - scope_name = tf.get_variable_scope().name - raise ValueError( - "For the tensor `%s` in scope `%s`, the actual rank " - "`%d` (shape = %s) is not equal to the expected rank `%s`" % - (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/modeling_test.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/modeling_test.py deleted file mode 100644 index a8508bbf4f4710f8a61277eeadbb4c6d5173b557..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/modeling_test.py +++ /dev/null @@ -1,281 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from npu_bridge.npu_init import * - -import collections -import json -import random -import re - -import six -from six.moves import range -import tensorflow.compat.v1 as tf -from smith.bert import modeling - - -class BertModelTest(tf.test.TestCase): - - class BertModelTester(object): - - def __init__(self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - initializer_range=0.02, - scope=None): - self.parent = parent - self.batch_size = batch_size - self.seq_length = seq_length - self.is_training = is_training - self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.scope = scope - - def create_model(self): - input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], - self.vocab_size) - - input_mask = None - if self.use_input_mask: - input_mask = BertModelTest.ids_tensor( - [self.batch_size, self.seq_length], vocab_size=2) - - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = BertModelTest.ids_tensor( - [self.batch_size, self.seq_length], self.type_vocab_size) - - config = 
modeling.BertConfig( - vocab_size=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range) - - model = modeling.BertModel( - config=config, - is_training=self.is_training, - input_ids=input_ids, - input_mask=input_mask, - token_type_ids=token_type_ids, - scope=self.scope) - - outputs = { - "embedding_output": model.get_embedding_output(), - "sequence_output": model.get_sequence_output(), - "pooled_output": model.get_pooled_output(), - "all_encoder_layers": model.get_all_encoder_layers(), - } - return outputs - - def check_output(self, result): - self.parent.assertAllEqual( - result["embedding_output"].shape, - [self.batch_size, self.seq_length, self.hidden_size]) - - self.parent.assertAllEqual( - result["sequence_output"].shape, - [self.batch_size, self.seq_length, self.hidden_size]) - - self.parent.assertAllEqual(result["pooled_output"].shape, - [self.batch_size, self.hidden_size]) - - def test_default(self): - self.run_tester(BertModelTest.BertModelTester(self)) - - def test_config_to_json_string(self): - config = modeling.BertConfig(vocab_size=99, hidden_size=37) - obj = json.loads(config.to_json_string()) - self.assertEqual(obj["vocab_size"], 99) - self.assertEqual(obj["hidden_size"], 37) - - def run_tester(self, tester): - with self.test_session() as sess: - ops = tester.create_model() - init_op = tf.group(tf.global_variables_initializer(), - tf.local_variables_initializer()) - sess.run(init_op) - output_result = sess.run(ops) - tester.check_output(output_result) - - self.assert_all_tensors_reachable(sess, [init_op, ops]) - - @classmethod - def ids_tensor(cls, shape, vocab_size, rng=None, name=None): - """Creates a random int32 tensor of the shape within the vocab size.""" - if rng is None: - rng = random.Random() - - total_dims = 1 - for dim in shape: - total_dims *= dim - - values = [] - for _ in range(total_dims): - values.append(rng.randint(0, vocab_size - 1)) - - return tf.constant(value=values, dtype=tf.int32, shape=shape, name=name) - - def assert_all_tensors_reachable(self, sess, outputs): - """Checks that all the tensors in the graph are reachable from outputs.""" - graph = sess.graph - - ignore_strings = [ - "^.*/assert_less_equal/.*$", - "^.*/dilation_rate$", - "^.*/Tensordot/concat$", - "^.*/Tensordot/concat/axis$", - "^testing/.*$", - ] - - ignore_regexes = [re.compile(x) for x in ignore_strings] - - unreachable = self.get_unreachable_ops(graph, outputs) - filtered_unreachable = [] - for x in unreachable: - do_ignore = False - for r in ignore_regexes: - m = r.match(x.name) - if m is not None: - do_ignore = True - if do_ignore: - continue - filtered_unreachable.append(x) - unreachable = filtered_unreachable - - self.assertEqual( - len(unreachable), 0, "The following ops are unreachable: %s" % - (" ".join([x.name for x in unreachable]))) - - @classmethod - def get_unreachable_ops(cls, graph, outputs): - """Finds all of the tensors in graph that are unreachable from outputs.""" - outputs = cls.flatten_recursive(outputs) - output_to_op = collections.defaultdict(list) - op_to_all = collections.defaultdict(list) - assign_out_to_in = collections.defaultdict(list) - - for op 
in graph.get_operations(): - for x in op.inputs: - op_to_all[op.name].append(x.name) - for y in op.outputs: - output_to_op[y.name].append(op.name) - op_to_all[op.name].append(y.name) - if str(op.type) == "Assign": - for y in op.outputs: - for x in op.inputs: - assign_out_to_in[y.name].append(x.name) - - assign_groups = collections.defaultdict(list) - for out_name in assign_out_to_in.keys(): - name_group = assign_out_to_in[out_name] - for n1 in name_group: - assign_groups[n1].append(out_name) - for n2 in name_group: - if n1 != n2: - assign_groups[n1].append(n2) - - seen_tensors = {} - stack = [x.name for x in outputs] - while stack: - name = stack.pop() - if name in seen_tensors: - continue - seen_tensors[name] = True - - if name in output_to_op: - for op_name in output_to_op[name]: - if op_name in op_to_all: - for input_name in op_to_all[op_name]: - if input_name not in stack: - stack.append(input_name) - - expanded_names = [] - if name in assign_groups: - for assign_name in assign_groups[name]: - expanded_names.append(assign_name) - - for expanded_name in expanded_names: - if expanded_name not in stack: - stack.append(expanded_name) - - unreachable_ops = [] - for op in graph.get_operations(): - is_unreachable = False - all_names = [x.name for x in op.inputs] + [x.name for x in op.outputs] - for name in all_names: - if name not in seen_tensors: - is_unreachable = True - if is_unreachable: - unreachable_ops.append(op) - return unreachable_ops - - @classmethod - def flatten_recursive(cls, item): - """Flattens (potentially nested) a tuple/dictionary/list to a list.""" - output = [] - if isinstance(item, list): - output.extend(item) - elif isinstance(item, tuple): - output.extend(list(item)) - elif isinstance(item, dict): - for (_, v) in six.iteritems(item): - output.append(v) - else: - return [item] - - flat_output = [] - for x in output: - flat_output.extend(cls.flatten_recursive(x)) - return flat_output - - -if __name__ == "__main__": - tf.test.main() - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/optimization.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/optimization.py deleted file mode 100644 index ffdb8cf399c1c51d2c4448948d3b1bd8e8aca22f..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/optimization.py +++ /dev/null @@ -1,184 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""Functions and classes related to optimization (weight updates).""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import re -import tensorflow.compat.v1 as tf -from npu_bridge.npu_init import * - -def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): - """Creates an optimizer training op.""" - global_step = tf.train.get_or_create_global_step() - - learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) - - # Implements linear decay of the learning rate. - learning_rate = tf.train.polynomial_decay( - learning_rate, - global_step, - num_train_steps, - end_learning_rate=0.0, - power=1.0, - cycle=False) - - # Implements linear warmup. I.e., if global_step < num_warmup_steps, the - # learning rate will be `global_step/num_warmup_steps * init_lr`. - if num_warmup_steps: - global_steps_int = tf.cast(global_step, tf.int32) - warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) - - global_steps_float = tf.cast(global_steps_int, tf.float32) - warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) - - warmup_percent_done = global_steps_float / warmup_steps_float - warmup_learning_rate = init_lr * warmup_percent_done - - is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) - learning_rate = ( - (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) - - # It is recommended that you use this optimizer for fine tuning, since this - # is how the model was trained (note that the Adam m/v variables are NOT - # loaded from init_checkpoint.) - optimizer = AdamWeightDecayOptimizer( - learning_rate=learning_rate, - weight_decay_rate=0.01, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-6, - exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) - - # TODO:此处为开启LossScale排查精度问题所用,正式PR去除该部分 - """ - #loss_scale_manager = ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.8) - loss_scale_manager = ExponentialUpdateLossScaleManager(init_loss_scale=1, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.8) - optimizer = NPULossScaleOptimizer(optimizer, loss_scale_manager) - """ - - if use_tpu: - optimizer = tf.tpu.CrossShardOptimizer(optimizer) - - tvars = tf.trainable_variables() - grads = tf.gradients(loss, tvars) - - # This is how the model was pre-trained. - (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) - - train_op = optimizer.apply_gradients( - zip(grads, tvars), global_step=global_step) - - # Normally the global step update is done inside of `apply_gradients`. - # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use - # a different optimizer, you should probably take this line out. 
- new_global_step = global_step + 1 - train_op = tf.group(train_op, [global_step.assign(new_global_step)]) - - return train_op - - -class AdamWeightDecayOptimizer(tf.train.Optimizer): - """A basic Adam optimizer that includes "correct" L2 weight decay.""" - - def __init__(self, - learning_rate, - weight_decay_rate=0.0, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-6, - exclude_from_weight_decay=None, - name="AdamWeightDecayOptimizer"): - """Constructs an AdamWeightDecayOptimizer.""" - super(AdamWeightDecayOptimizer, self).__init__(False, name) - - self.learning_rate = learning_rate - self.weight_decay_rate = weight_decay_rate - self.beta_1 = beta_1 - self.beta_2 = beta_2 - self.epsilon = epsilon - self.exclude_from_weight_decay = exclude_from_weight_decay - - def apply_gradients(self, grads_and_vars, global_step=None, name=None): - """See base class.""" - assignments = [] - for (grad, param) in grads_and_vars: - if grad is None or param is None: - continue - - param_name = self._get_variable_name(param.name) - - m = tf.get_variable( - name=param_name + "/adam_m", - shape=param.shape.as_list(), - dtype=tf.float32, - trainable=False, - initializer=tf.zeros_initializer()) - v = tf.get_variable( - name=param_name + "/adam_v", - shape=param.shape.as_list(), - dtype=tf.float32, - trainable=False, - initializer=tf.zeros_initializer()) - - # Standard Adam update. - next_m = ( - tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) - next_v = ( - tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, - tf.square(grad))) - - update = next_m / (tf.sqrt(next_v) + self.epsilon) - - # Just adding the square of the weights to the loss function is *not* - # the correct way of using L2 regularization/weight decay with Adam, - # since that will interact with the m and v parameters in strange ways. - # - # Instead we want to decay the weights in a manner that doesn't interact - # with the m/v parameters. This is equivalent to adding the square - # of the weights to the loss with plain (non-momentum) SGD. - if self._do_use_weight_decay(param_name): - update += self.weight_decay_rate * param - - update_with_lr = self.learning_rate * update - - next_param = param - update_with_lr - - assignments.extend( - [param.assign(next_param), - m.assign(next_m), - v.assign(next_v)]) - - return tf.group(*assignments, name=name) - - def _do_use_weight_decay(self, param_name): - """Whether to use L2 weight decay for `param_name`.""" - if not self.weight_decay_rate: - return False - if self.exclude_from_weight_decay: - for r in self.exclude_from_weight_decay: - if re.search(r, param_name) is not None: - return False - return True - - def _get_variable_name(self, param_name): - """Get the variable name from the tensor name.""" - m = re.match("^(.*):\\d+$", param_name) - if m is not None: - param_name = m.group(1) - return param_name diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/optimization_test.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/optimization_test.py deleted file mode 100644 index 27dcefeeb816d18aa7613b4a3a3f3d4f8d61ff6d..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/optimization_test.py +++ /dev/null @@ -1,53 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from npu_bridge.npu_init import * - -from six.moves import range -from six.moves import zip -import tensorflow.compat.v1 as tf -from smith.bert import optimization - - -class OptimizationTest(tf.test.TestCase): - - def test_adam(self): - with self.test_session() as sess: - w = tf.get_variable( - "w", - shape=[3], - initializer=tf.constant_initializer([0.1, -0.2, -0.1])) - x = tf.constant([0.4, 0.2, -0.5]) - loss = tf.reduce_mean(tf.square(x - w)) - tvars = tf.trainable_variables() - grads = tf.gradients(loss, tvars) - global_step = tf.train.get_or_create_global_step() - optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2) - train_op = optimizer.apply_gradients(list(zip(grads, tvars)), global_step) - init_op = tf.group(tf.global_variables_initializer(), - tf.local_variables_initializer()) - sess.run(init_op) - for _ in range(100): - sess.run(train_op) - w_np = sess.run(w) - self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2) - - -if __name__ == "__main__": - tf.test.main() - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/tokenization.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/tokenization.py deleted file mode 100644 index d647deb53eea86a4ab4624896fc3440a83e93f46..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/tokenization.py +++ /dev/null @@ -1,430 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Tokenization classes.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from npu_bridge.npu_init import * - -import collections -import re -import unicodedata - -from absl import flags -import six -import tensorflow.compat.v1 as tf - -FLAGS = flags.FLAGS - -flags.DEFINE_bool( - "preserve_unused_tokens", False, - "If True, Wordpiece tokenization will not be applied to words in the vocab." 
-) - -_UNUSED_TOKEN_RE = re.compile("^\\[unused\\d+\\]$") - - -def preserve_token(token, vocab): - """Returns True if the token should forgo tokenization and be preserved.""" - if not FLAGS.preserve_unused_tokens: - return False - if token not in vocab: - return False - return bool(_UNUSED_TOKEN_RE.search(token)) - - -def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): - """Checks whether the casing config is consistent with the checkpoint name.""" - - # The casing has to be passed in by the user and there is no explicit check - # as to whether it matches the checkpoint. The casing information probably - # should have been stored in the bert_config.json file, but it's not, so - # we have to heuristically detect it to validate. - - if not init_checkpoint: - return - - m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint) - if m is None: - return - - model_name = m.group(1) - - lower_models = [ - "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12", - "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12" - ] - - cased_models = [ - "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16", - "multi_cased_L-12_H-768_A-12" - ] - - is_bad_config = False - if model_name in lower_models and not do_lower_case: - is_bad_config = True - actual_flag = "False" - case_name = "lowercased" - opposite_flag = "True" - - if model_name in cased_models and do_lower_case: - is_bad_config = True - actual_flag = "True" - case_name = "cased" - opposite_flag = "False" - - if is_bad_config: - raise ValueError( - "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. " - "However, `%s` seems to be a %s model, so you " - "should pass in `--do_lower_case=%s` so that the fine-tuning matches " - "how the model was pre-training. If this error is wrong, please " - "just comment out this check." % (actual_flag, init_checkpoint, - model_name, case_name, opposite_flag)) - - -def convert_to_unicode(text): - """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" - if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode("utf-8", "ignore") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - elif six.PY2: - if isinstance(text, str): - return text.decode("utf-8", "ignore") - elif isinstance(text, unicode): - return text - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - else: - raise ValueError("Not running on Python2 or Python 3?") - - -def printable_text(text): - """Returns text encoded in a way suitable for print or `tf.logging`.""" - - # These functions want `str` for both Python2 and Python3, but in one case - # it's a Unicode string and in the other it's a byte string. 
- if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode("utf-8", "ignore") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - elif six.PY2: - if isinstance(text, str): - return text - elif isinstance(text, unicode): - return text.encode("utf-8") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - else: - raise ValueError("Not running on Python2 or Python 3?") - - -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = collections.OrderedDict() - with tf.gfile.GFile(vocab_file, "r") as reader: - while True: - token = convert_to_unicode(reader.readline()) - if not token: - break - token = token.strip() - if token not in vocab: - vocab[token] = len(vocab) - return vocab - - -def convert_by_vocab(vocab, items): - """Converts a sequence of [tokens|ids] using the vocab.""" - output = [] - for item in items: - output.append(vocab[item]) - return output - - -def convert_tokens_to_ids(vocab, tokens): - return convert_by_vocab(vocab, tokens) - - -def convert_ids_to_tokens(inv_vocab, ids): - return convert_by_vocab(inv_vocab, ids) - - -def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a piece of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -class FullTokenizer(object): - """Runs end-to-end tokenization.""" - - def __init__(self, vocab_file, do_lower_case=True): - self.vocab = load_vocab(vocab_file) - self.inv_vocab = {v: k for k, v in self.vocab.items()} - self.basic_tokenizer = BasicTokenizer( - do_lower_case=do_lower_case, vocab=self.vocab) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) - - def tokenize(self, text): - split_tokens = [] - for token in self.basic_tokenizer.tokenize(text): - if preserve_token(token, self.vocab): - split_tokens.append(token) - continue - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - - return split_tokens - - def convert_tokens_to_ids(self, tokens): - return convert_by_vocab(self.vocab, tokens) - - def convert_ids_to_tokens(self, ids): - return convert_by_vocab(self.inv_vocab, ids) - - -class BasicTokenizer(object): - """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" - - def __init__(self, do_lower_case=True, vocab=tuple()): - """Constructs a BasicTokenizer. - - Args: - do_lower_case: Whether to lower case the input. - vocab: A container of tokens to not mutate during tokenization. - """ - self.do_lower_case = do_lower_case - self.vocab = vocab - - def tokenize(self, text): - """Tokenizes a piece of text.""" - text = convert_to_unicode(text) - text = self._clean_text(text) - - # This was added on November 1st, 2018 for the multilingual and Chinese - # models. This is also applied to the English models now, but it doesn't - # matter since the English models were not trained on any Chinese data - # and generally don't have any Chinese data in them (there are Chinese - # characters in the vocabulary because Wikipedia does have some Chinese - # words in the English Wikipedia.). 
- text = self._tokenize_chinese_chars(text) - - orig_tokens = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if preserve_token(token, self.vocab): - split_tokens.append(token) - continue - if self.do_lower_case: - token = token.lower() - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token)) - - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, text): - """Splits punctuation on a piece of text.""" - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return ["".join(x) for x in output] - - def _tokenize_chinese_chars(self, text): - """Adds whitespace around any CJK character.""" - output = [] - for char in text: - cp = ord(char) - if self._is_chinese_char(cp): - output.append(" ") - output.append(char) - output.append(" ") - else: - output.append(char) - return "".join(output) - - def _is_chinese_char(self, cp): - """Checks whether CP is the codepoint of a CJK character.""" - # This defines a "chinese character" as anything in the CJK Unicode block: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - # - # Note that the CJK Unicode block is NOT all Japanese and Korean characters, - # despite its name. The modern Korean Hangul alphabet is a different block, - # as is Japanese Hiragana and Katakana. Those alphabets are used to write - # space-separated words, so they are not treated specially and handled - # like all of the other languages. - if ((cp >= 0x4E00 and cp <= 0x9FFF) or # - (cp >= 0x3400 and cp <= 0x4DBF) or # - (cp >= 0x20000 and cp <= 0x2A6DF) or # - (cp >= 0x2A700 and cp <= 0x2B73F) or # - (cp >= 0x2B740 and cp <= 0x2B81F) or # - (cp >= 0x2B820 and cp <= 0x2CEAF) or - (cp >= 0xF900 and cp <= 0xFAFF) or # - (cp >= 0x2F800 and cp <= 0x2FA1F)): # - return True - - return False - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xfffd or _is_control(char): - continue - if _is_whitespace(char): - output.append(" ") - else: - output.append(char) - return "".join(output) - - -class WordpieceTokenizer(object): - """Runs WordPiece tokenization.""" - - def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word - - def tokenize(self, text): - """Tokenizes a piece of text into its word pieces. - - This uses a greedy longest-match-first algorithm to perform tokenization - using the given vocabulary. - - For example: - input = "unaffable" - output = ["un", "##aff", "##able"] - - Args: - text: A single token or whitespace separated tokens. This should have - already been passed through `BasicTokenizer`. - - Returns: - A list of wordpiece tokens. 
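# Illustrative only: a tiny standalone sketch of the greedy longest-match-first
# loop described above, using a toy vocabulary. It is not the module's
# WordpieceTokenizer; it just mirrors the matching logic for a single token.
def _greedy_wordpiece(token, vocab, unk="[UNK]"):
    pieces, start = [], 0
    while start < len(token):
        end, cur = len(token), None
        while start < end:                       # try the longest substring first
            sub = ("##" if start > 0 else "") + token[start:end]
            if sub in vocab:
                cur = sub
                break
            end -= 1
        if cur is None:                          # nothing matched: emit [UNK] for the whole token
            return [unk]
        pieces.append(cur)
        start = end
    return pieces

print(_greedy_wordpiece("unaffable", {"un", "##aff", "##able"}))  # ['un', '##aff', '##able']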
- """ - - text = convert_to_unicode(text) - - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - output_tokens.append(self.unk_token) - continue - - is_bad = False - start = 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - while start < end: - substr = "".join(chars[start:end]) - if start > 0: - substr = "##" + substr - if substr in self.vocab: - cur_substr = substr - break - end -= 1 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - start = end - - if is_bad: - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens - - -def _is_whitespace(char): - """Checks whether `char` is a whitespace character.""" - # \t, \n, and \r are technically control characters but we treat them - # as whitespace since they are generally considered as such. - if char == " " or char == "\t" or char == "\n" or char == "\r": - return True - cat = unicodedata.category(char) - if cat == "Zs": - return True - return False - - -def _is_control(char): - """Checks whether `char` is a control character.""" - # These are technically control characters but we count them as whitespace - # characters. - if char == "\t" or char == "\n" or char == "\r": - return False - cat = unicodedata.category(char) - if cat in ("Cc", "Cf"): - return True - return False - - -def _is_punctuation(char): - """Checks whether `char` is a punctuation character.""" - cp = ord(char) - # We treat all non-letter/number ASCII as punctuation. - # Characters such as "^", "$", and "`" are not in the Unicode - # Punctuation class but we treat them as punctuation anyways, for - # consistency. - if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or - (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): - return True - cat = unicodedata.category(char) - if cat.startswith("P"): - return True - return False - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/tokenization_test.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/tokenization_test.py deleted file mode 100644 index af195497c38c5d44c152930d439760c3c6886cec..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/bert/tokenization_test.py +++ /dev/null @@ -1,166 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from npu_bridge.npu_init import * - -import os -import tempfile - -from absl import flags -from absl.testing import flagsaver -import six -import tensorflow.compat.v1 as tf - -from smith.bert import tokenization - -FLAGS = flags.FLAGS - - -class TokenizationTest(tf.test.TestCase): - - def tokenize_with_full_tokenizer(self): - """Returns tokens and ids processed with FullTokenizer.""" - text = u"UNwant\u00E9d,running [unused0] [CLS] [unused55]" - vocab_tokens = [ - "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", - "##ing", ",", "[unused0]" - ] - with tempfile.NamedTemporaryFile(delete=False) as vocab_writer: - if six.PY2: - vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - else: - vocab_writer.write("".join( - [x + "\n" for x in vocab_tokens]).encode("utf-8")) - - vocab_file = vocab_writer.name - - tokenizer = tokenization.FullTokenizer(vocab_file) - os.unlink(vocab_file) - tokens = tokenizer.tokenize(text) - ids = tokenizer.convert_tokens_to_ids(tokens) - return tokens, ids - - def test_full_tokenizer(self): - tokens, ids = self.tokenize_with_full_tokenizer() - - self.assertAllEqual(tokens, [ - "un", "##want", "##ed", ",", "runn", "##ing", "[UNK]", "[UNK]", "[UNK]", - "[UNK]", "[UNK]", "[UNK]", "[UNK]", "[UNK]", "[UNK]" - ]) - - self.assertAllEqual(ids, [7, 4, 5, 10, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0]) - - def test_full_tokenizer_preserve_unused(self): - with flagsaver.flagsaver(preserve_unused_tokens=True): - tokens, ids = self.tokenize_with_full_tokenizer() - - self.assertAllEqual(tokens, [ - "un", "##want", "##ed", ",", "runn", "##ing", "[unused0]", "[UNK]", - "[UNK]", "[UNK]", "[UNK]", "[UNK]", "[UNK]" - ]) - - self.assertAllEqual(ids, [7, 4, 5, 10, 8, 9, 11, 0, 0, 0, 0, 0, 0]) - - def test_chinese(self): - tokenizer = tokenization.BasicTokenizer() - - self.assertAllEqual( - tokenizer.tokenize(u"ah\u535A\u63A8zz"), - [u"ah", u"\u535A", u"\u63A8", u"zz"]) - - def test_basic_tokenizer_lower(self): - tokenizer = tokenization.BasicTokenizer(do_lower_case=True) - - self.assertAllEqual( - tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), - ["hello", "!", "how", "are", "you", "?"]) - self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) - - def test_basic_tokenizer_no_lower(self): - tokenizer = tokenization.BasicTokenizer(do_lower_case=False) - - self.assertAllEqual( - tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? 
"), - ["HeLLo", "!", "how", "Are", "yoU", "?"]) - - def test_wordpiece_tokenizer(self): - vocab_tokens = [ - "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", - "##ing" - ] - - vocab = {} - for (i, token) in enumerate(vocab_tokens): - vocab[token] = i - tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) - - self.assertAllEqual(tokenizer.tokenize(""), []) - - self.assertAllEqual( - tokenizer.tokenize("unwanted running"), - ["un", "##want", "##ed", "runn", "##ing"]) - - self.assertAllEqual( - tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) - - def test_convert_tokens_to_ids(self): - vocab_tokens = [ - "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", - "##ing" - ] - - vocab = {} - for (i, token) in enumerate(vocab_tokens): - vocab[token] = i - - self.assertAllEqual( - tokenization.convert_tokens_to_ids( - vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9]) - - def test_is_whitespace(self): - self.assertTrue(tokenization._is_whitespace(u" ")) - self.assertTrue(tokenization._is_whitespace(u"\t")) - self.assertTrue(tokenization._is_whitespace(u"\r")) - self.assertTrue(tokenization._is_whitespace(u"\n")) - self.assertTrue(tokenization._is_whitespace(u"\u00A0")) - - self.assertFalse(tokenization._is_whitespace(u"A")) - self.assertFalse(tokenization._is_whitespace(u"-")) - - def test_is_control(self): - self.assertTrue(tokenization._is_control(u"\u0005")) - - self.assertFalse(tokenization._is_control(u"A")) - self.assertFalse(tokenization._is_control(u" ")) - self.assertFalse(tokenization._is_control(u"\t")) - self.assertFalse(tokenization._is_control(u"\r")) - self.assertFalse(tokenization._is_control(u"\U0001F4A9")) - - def test_is_punctuation(self): - self.assertTrue(tokenization._is_punctuation(u"-")) - self.assertTrue(tokenization._is_punctuation(u"$")) - self.assertTrue(tokenization._is_punctuation(u"`")) - self.assertTrue(tokenization._is_punctuation(u".")) - - self.assertFalse(tokenization._is_punctuation(u"A")) - self.assertFalse(tokenization._is_punctuation(u" ")) - - -if __name__ == "__main__": - tf.test.main() - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/bert_config.json b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/bert_config.json deleted file mode 100644 index fca794a5f07ff8f963fe8b61e3694b0fb7f955df..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/bert_config.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 768, - "initializer_range": 0.02, - "intermediate_size": 3072, - "max_position_embeddings": 512, - "num_attention_heads": 12, - "num_hidden_layers": 12, - "type_vocab_size": 2, - "vocab_size": 30522 -} diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/doc_bert_3l_256h_config.json b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/doc_bert_3l_256h_config.json deleted file mode 100644 index 054f9b23f171c2b945752dda047902a5be9cc1e5..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/doc_bert_3l_256h_config.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 256, - "initializer_range": 0.02, - "intermediate_size": 512, - "max_position_embeddings": 128, - "num_attention_heads": 4, - "num_hidden_layers": 3, - 
"type_vocab_size": 2, - "vocab_size": 30522 -} diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/doc_bert_3l_768h_config.json b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/doc_bert_3l_768h_config.json deleted file mode 100644 index 47ee23fc4698aed581d925401fcaeae3e7e19a15..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/doc_bert_3l_768h_config.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 768, - "initializer_range": 0.02, - "intermediate_size": 512, - "max_position_embeddings": 128, - "num_attention_heads": 4, - "num_hidden_layers": 3, - "type_vocab_size": 2, - "vocab_size": 30522 -} diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/dual_encoder_config.smith_short.32.8.pbtxt b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/dual_encoder_config.smith_short.32.8.pbtxt deleted file mode 100644 index 4eddbcbbef5dffe196e829fa89969fdb2e710043..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/dual_encoder_config.smith_short.32.8.pbtxt +++ /dev/null @@ -1,37 +0,0 @@ -# Definition of the experimental settings. -# proto-file: smith/experiment_config.proto -# proto-message: DualEncoderConfig - -encoder_config { - model_name: "smith_dual_encoder" - init_checkpoint: "/home/test_user06/tc_workspace/data/bert/uncased_L-12_H-768_A-12/bert_model.ckpt" - bert_config_file: "/home/test_user06/tc_workspace/tmp/smith_force_fp32/config/bert_config.json" - doc_bert_config_file: "/home/test_user06/tc_workspace/tmp/smith_force_fp32/config/doc_bert_3l_768h_config.json" - vocab_file: "/home/test_user06/tc_workspace/data/bert/uncased_L-12_H-768_A-12/vocab.txt" - max_seq_length: 32 - max_predictions_per_seq: 5 - max_sent_length_by_word: 32 - max_doc_length_by_sentence: 64 - loop_sent_number_per_doc: 8 - sent_bert_trainable: true - max_masked_sent_per_doc: 0 - use_masked_sentence_lm_loss: false - num_labels: 2 - doc_rep_combine_mode: "normal" - doc_rep_combine_attention_size: 256 -} -train_eval_config { - input_file_for_train: "/home/test_user06/tc_workspace/data/output_file/tianchen_0311.tfrecord" - input_file_for_eval: "/home/test_user06/tc_workspace/data/output_file/tianchen_0311.tfrecord" - train_batch_size: 32 - eval_batch_size: 32 - predict_batch_size: 32 - max_eval_steps: 54 - save_checkpoints_steps: 10 - iterations_per_loop: 10 - eval_with_eval_data: true - neg_to_pos_example_ratio: 1.0 -} -loss_config { - similarity_score_amplifier: 6.0 -} diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/dual_encoder_config.smith_short.32.8_orig.pbtxt b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/dual_encoder_config.smith_short.32.8_orig.pbtxt deleted file mode 100644 index faf7b2409a1f28134cbcf7613236136924727049..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/dual_encoder_config.smith_short.32.8_orig.pbtxt +++ /dev/null @@ -1,37 +0,0 @@ -# Definition of the experimental settings. 
-# proto-file: smith/experiment_config.proto -# proto-message: DualEncoderConfig - -encoder_config { - model_name: "smith_dual_encoder" - init_checkpoint: "/tmp/data/uncased_L-12_H-768_A-12/bert_model.ckpt" - bert_config_file: "/tmp/data/config/bert_config.json" - doc_bert_config_file: "/tmp/data/config/doc_bert_3l_768h_config.json" - vocab_file: "/tmp/data/uncased_L-12_H-768_A-12/vocab.txt" - max_seq_length: 32 - max_predictions_per_seq: 5 - max_sent_length_by_word: 32 - max_doc_length_by_sentence: 64 - loop_sent_number_per_doc: 8 - sent_bert_trainable: true - max_masked_sent_per_doc: 0 - use_masked_sentence_lm_loss: false - num_labels: 2 - doc_rep_combine_mode: "normal" - doc_rep_combine_attention_size: 256 -} -train_eval_config { - input_file_for_train: "/tmp/data/gwikimatch_v2_human_neg_1.train.smith_msenl_32_mdl_64_lm_false.tfrecord" - input_file_for_eval: "/tmp/data/gwikimatch_v2_human_neg_1.eval_external_wdp_smith_32_64_false.tfrecord" - train_batch_size: 32 - eval_batch_size: 32 - predict_batch_size: 32 - max_eval_steps: 54 - save_checkpoints_steps: 10 - iterations_per_loop: 10 - eval_with_eval_data: true - neg_to_pos_example_ratio: 1.0 -} -loss_config { - similarity_score_amplifier: 6.0 -} diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/dual_encoder_config.smith_wsp.32.48.pbtxt b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/dual_encoder_config.smith_wsp.32.48.pbtxt deleted file mode 100644 index 76dc21e2c6de90c2f5450fe93e6ba5407af5bb2d..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/dual_encoder_config.smith_wsp.32.48.pbtxt +++ /dev/null @@ -1,37 +0,0 @@ -# Definition of the experimental settings. -# proto-file: smith/experiment_config.proto -# proto-message: DualEncoderConfig - -encoder_config { - model_name: "smith_dual_encoder" - init_checkpoint: "/home/test_user06/tc_workspace/source_data/smith_wsp_pretrain_ckpt_opensource/model.ckpt-400000" - bert_config_file: "/home/test_user06/tc_workspace/tmp/smith/config/sent_bert_4l_config.json" - doc_bert_config_file: "/home/test_user06/tc_workspace/tmp/smith/config/doc_bert_3l_256h_config.json" - vocab_file: "/home/test_user06/tc_workspace/data/uncased_L-12_H-768_A-12/vocab.txt" - max_seq_length: 32 - max_predictions_per_seq: 5 - max_sent_length_by_word: 32 - max_doc_length_by_sentence: 64 - loop_sent_number_per_doc: 48 - sent_bert_trainable: true - max_masked_sent_per_doc: 0 - use_masked_sentence_lm_loss: false - num_labels: 2 - doc_rep_combine_mode: "normal" - doc_rep_combine_attention_size: 256 -} -train_eval_config { - input_file_for_train: "/home/test_user06/tc_workspace/data/output_file/tianchen_0311.tfrecord" - input_file_for_eval: "/home/test_user06/tc_workspace/data/output_file/tianchen_0311.tfrecord" - train_batch_size: 32 - eval_batch_size: 32 - predict_batch_size: 32 - max_eval_steps: 54 - save_checkpoints_steps: 10 - iterations_per_loop: 10 - eval_with_eval_data: true - neg_to_pos_example_ratio: 1.0 -} -loss_config { - similarity_score_amplifier: 6.0 -} diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/sent_bert_4l_config.json b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/sent_bert_4l_config.json deleted file mode 100644 index 7798a2b459858ec804f753524011ed2c5b2dcf8d..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/sent_bert_4l_config.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - 
"hidden_dropout_prob": 0.1, - "hidden_size": 256, - "initializer_range": 0.02, - "intermediate_size": 512, - "max_position_embeddings": 256, - "num_attention_heads": 4, - "num_hidden_layers": 4, - "type_vocab_size": 2, - "vocab_size": 30522 -} diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/sent_bert_6l_config.json b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/sent_bert_6l_config.json deleted file mode 100644 index de1d26cd4f68e3d6268a2bf5fbeae73bf0df8398..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/config/sent_bert_6l_config.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 256, - "initializer_range": 0.02, - "intermediate_size": 512, - "max_position_embeddings": 256, - "num_attention_heads": 4, - "num_hidden_layers": 6, - "type_vocab_size": 2, - "vocab_size": 30522 -} diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/constants.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/constants.py deleted file mode 100644 index c4156a668cfd0398a012366ad7c89e9de2ea93bb..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/constants.py +++ /dev/null @@ -1,56 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# Lint as: python3 -"""Constants used in dual encoder SMITH model.""" - -# Constants related to training mode. -# There are three different modes for the model training: -# Joint_train: the loss includes both masked LM losses and the text matching -# loss. -# Pretrain: the loss includes masked word LM loss and masked sentence LM loss. -# The masked sentence LM loss only applies to the dual encoder SMITH model. -# Finetune: the loss includes the text matching loss. -TRAIN_MODE_FINETUNE = "finetune" -TRAIN_MODE_PRETRAIN = "pretrain" -TRAIN_MODE_JOINT_TRAIN = "joint_train" - -# Constants related to model name. -MODEL_NAME_SMITH_DUAL_ENCODER = "smith_dual_encoder" - -# Constants related to final document representation combining method. 
-DOC_COMBINE_NORMAL = "normal" -DOC_COMBINE_SUM_CONCAT = "sum_concat" -DOC_COMBINE_MEAN_CONCAT = "mean_concat" -DOC_COMBINE_ATTENTION = "attention" - -# Constants related to human rating aggregation method. -RATING_AGG_MEAN = "mean" -RATING_AGG_MAJORITY_VOTE = "majority_vote" diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/experiment_config.proto b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/experiment_config.proto deleted file mode 100644 index 6032d25d2a141d52f03bc947a56d9e903e8d94e6..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/experiment_config.proto +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright 2021 The Google Research Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -syntax = "proto2"; - -package smith; - -// Proto on the model setting and input data sets of dual encoder SMITH/BERT. - -// Proto to specify train/eval datasets and train/eval settings. -// Next Available Field: 13 -message TrainEvalConfig { - // File patterns for train set, separated by commas if you have multiple - // files. This field is required. - optional string input_file_for_train = 1; - - // File patterns for eval set, separated by commas if you have multiple - // files. - optional string input_file_for_eval = 2; - - // Total batch size for training. - optional int32 train_batch_size = 4 [default = 32]; - - // Total batch size for evaluation. - optional int32 eval_batch_size = 5 [default = 32]; - - // Total batch size for prediction. - optional int32 predict_batch_size = 6 [default = 32]; - - // Maximum number of eval steps. - // This should be set according to the size of eval data. During model - // pre-training, we can also use a part of training data for evaluation. - optional int32 max_eval_steps = 7 [default = 100]; - - // How often to save the model checkpoint. - optional int32 save_checkpoints_steps = 8 [default = 1000]; - - // How many steps to make in each estimator call. - optional int32 iterations_per_loop = 9 [default = 1000]; - - // This is set to true if we always want to evaluate the model with the eval - // or test data even in the pre-train mode, so that we know whether the model - // overfits the training data. - optional bool eval_with_eval_data = 10 [default = true]; - - // The weight to compensate when we have more negative examples. - optional float neg_to_pos_example_ratio = 12 [default = 1.0]; -} - -// Configuration for BERT-based or SMITH-based encoder. -// Next Available Field: 18 -message EncoderConfig { - // The name of the model. - optional string model_name = 12 [default = "smith_dual_encoder"]; - - // Which pretrained checkpoint to use. This field is required for fine-tuning. - optional string init_checkpoint = 1; - - // Which prediction checkpoint to use for model prediction process. - optional string predict_checkpoint = 2; - - // Where is the bert config file. 
- optional string bert_config_file = 3; - - // Where is the document level bert config file, which is only used in the - // the SMITH model. - optional string doc_bert_config_file = 4; - - // Where is the vocab file. - optional string vocab_file = 5; - - // This is only used for the BERT model. The maximum total input sequence - // length after tokenization. Sequences longer than this will be truncated, - // and sequences shorter than this will be padded. Normally, this should be - // no larger than the one used in pretraining. This should be matched with - // the data generation settings. - optional int32 max_seq_length = 6 [default = 32]; - - // Maximum number of masked LM predictions per sequence. Note that for the - // SMITH model, the maximum number of masked LM predictions per document is - // max_doc_length_by_sentence * max_predictions_per_seq. - optional int32 max_predictions_per_seq = 7 [default = 5]; - - // This is only used for the SMITH model. The maximum number of tokens in a - // sentence. - optional int32 max_sent_length_by_word = 8 [default = 32]; - - // This is only used for the SMITH model. The maximum number of sentences in - // a document. - optional int32 max_doc_length_by_sentence = 9 [default = 64]; - - // This is only used for the SMITH model. The number of looped sentences in a - // document to control the used TPU memory. This number should be shorter - // than the setting of max_doc_length_by_sentence. - optional int32 loop_sent_number_per_doc = 10 [default = 64]; - - // This is only used for the SMITH model. Whether update the parameters in - // the sentence level Tranformers of the SMITH model. - optional bool sent_bert_trainable = 11 [default = true]; - - // This is only used for the SMITH model. The maximum number of sentences to - // be masked in each document. - optional int32 max_masked_sent_per_doc = 14 [default = 2]; - - // This is only used for the SMITH model. If true, add the masked sentence LM - // loss into the total training loss. - optional bool use_masked_sentence_lm_loss = 15 [default = false]; - - // The number of different labels in classification task. - optional int32 num_labels = 13 [default = 2]; - - // The type of document representation combing mode. It can be normal, - // sum_concat, mean_concat or attention. - optional string doc_rep_combine_mode = 16 [default = "normal"]; - - // The size of the attention vector in the attention layer for combining the - // sentence level representations to generate the document level - // representations. - optional int32 doc_rep_combine_attention_size = 17 [default = 256]; -} - -// Configuration for a loss function. -// Next Available Field: 2 -message LossConfig { - // Hyperparameters for the loss function. - // The amplifier to increase the logits value, so that sigmoid(logits) is - // closer to 0 or 1. The default value is 6.0. - optional float similarity_score_amplifier = 1 [default = 6.0]; -} - -// Next Available Field: 4 -message DualEncoderConfig { - // Config for the BERT/SMITH based dual encoder. - - optional EncoderConfig encoder_config = 1; - - // This field must be set to supply the train/eval data. - optional TrainEvalConfig train_eval_config = 2; - - // Config for optimization, this field is required. 
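// Illustrative usage sketch (hypothetical Python snippet and file name, not
// part of this .proto): a text-format config is typically parsed into these
// messages with the standard protobuf text_format API, e.g.
//   from google.protobuf import text_format
//   from smith import experiment_config_pb2
//   with open("dual_encoder_config.pbtxt") as f:
//       cfg = text_format.Parse(f.read(),
//                               experiment_config_pb2.DualEncoderConfig())
//   print(cfg.encoder_config.max_seq_length)  # 32 unless overridden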
- optional LossConfig loss_config = 3; -} diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/experiment_config_pb2.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/experiment_config_pb2.py deleted file mode 100644 index d3b9a24c6c98ae0485031400eb4604925b968da3..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/experiment_config_pb2.py +++ /dev/null @@ -1,399 +0,0 @@ -# coding=utf-8 -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# Generated by the protocol buffer compiler. DO NOT EDIT! -# source: smith/experiment_config.proto -"""Generated protocol buffer code.""" -from npu_bridge.npu_init import * -from google.protobuf import descriptor as _descriptor -from google.protobuf import message as _message -from google.protobuf import reflection as _reflection -from google.protobuf import symbol_database as _symbol_database -# @@protoc_insertion_point(imports) - -_sym_db = _symbol_database.Default() - - - - -DESCRIPTOR = _descriptor.FileDescriptor( - name='smith/experiment_config.proto', - package='smith', - syntax='proto2', - serialized_options=None, - create_key=_descriptor._internal_create_key, - serialized_pb=b'\n\x1dsmith/experiment_config.proto\x12\x05smith\"\xd5\x02\n\x0fTrainEvalConfig\x12\x1c\n\x14input_file_for_train\x18\x01 \x01(\t\x12\x1b\n\x13input_file_for_eval\x18\x02 \x01(\t\x12\x1c\n\x10train_batch_size\x18\x04 \x01(\x05:\x02\x33\x32\x12\x1b\n\x0f\x65val_batch_size\x18\x05 \x01(\x05:\x02\x33\x32\x12\x1e\n\x12predict_batch_size\x18\x06 \x01(\x05:\x02\x33\x32\x12\x1b\n\x0emax_eval_steps\x18\x07 \x01(\x05:\x03\x31\x30\x30\x12$\n\x16save_checkpoints_steps\x18\x08 \x01(\x05:\x04\x31\x30\x30\x30\x12!\n\x13iterations_per_loop\x18\t \x01(\x05:\x04\x31\x30\x30\x30\x12!\n\x13\x65val_with_eval_data\x18\n \x01(\x08:\x04true\x12#\n\x18neg_to_pos_example_ratio\x18\x0c \x01(\x02:\x01\x31\"\xc8\x04\n\rEncoderConfig\x12&\n\nmodel_name\x18\x0c \x01(\t:\x12smith_dual_encoder\x12\x17\n\x0finit_checkpoint\x18\x01 \x01(\t\x12\x1a\n\x12predict_checkpoint\x18\x02 \x01(\t\x12\x18\n\x10\x62\x65rt_config_file\x18\x03 \x01(\t\x12\x1c\n\x14\x64oc_bert_config_file\x18\x04 \x01(\t\x12\x12\n\nvocab_file\x18\x05 \x01(\t\x12\x1a\n\x0emax_seq_length\x18\x06 \x01(\x05:\x02\x33\x32\x12\"\n\x17max_predictions_per_seq\x18\x07 \x01(\x05:\x01\x35\x12#\n\x17max_sent_length_by_word\x18\x08 \x01(\x05:\x02\x33\x32\x12&\n\x1amax_doc_length_by_sentence\x18\t \x01(\x05:\x02\x36\x34\x12$\n\x18loop_sent_number_per_doc\x18\n \x01(\x05:\x02\x36\x34\x12!\n\x13sent_bert_trainable\x18\x0b \x01(\x08:\x04true\x12\"\n\x17max_masked_sent_per_doc\x18\x0e \x01(\x05:\x01\x32\x12*\n\x1buse_masked_sentence_lm_loss\x18\x0f \x01(\x08:\x05\x66\x61lse\x12\x15\n\nnum_labels\x18\r \x01(\x05:\x01\x32\x12$\n\x14\x64oc_rep_combine_mode\x18\x10 \x01(\t:\x06normal\x12+\n\x1e\x64oc_rep_combine_attention_size\x18\x11 
\x01(\x05:\x03\x32\x35\x36\"3\n\nLossConfig\x12%\n\x1asimilarity_score_amplifier\x18\x01 \x01(\x02:\x01\x36\"\x9c\x01\n\x11\x44ualEncoderConfig\x12,\n\x0e\x65ncoder_config\x18\x01 \x01(\x0b\x32\x14.smith.EncoderConfig\x12\x31\n\x11train_eval_config\x18\x02 \x01(\x0b\x32\x16.smith.TrainEvalConfig\x12&\n\x0bloss_config\x18\x03 \x01(\x0b\x32\x11.smith.LossConfig' -) - - - - -_TRAINEVALCONFIG = _descriptor.Descriptor( - name='TrainEvalConfig', - full_name='smith.TrainEvalConfig', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='input_file_for_train', full_name='smith.TrainEvalConfig.input_file_for_train', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='input_file_for_eval', full_name='smith.TrainEvalConfig.input_file_for_eval', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='train_batch_size', full_name='smith.TrainEvalConfig.train_batch_size', index=2, - number=4, type=5, cpp_type=1, label=1, - has_default_value=True, default_value=32, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='eval_batch_size', full_name='smith.TrainEvalConfig.eval_batch_size', index=3, - number=5, type=5, cpp_type=1, label=1, - has_default_value=True, default_value=32, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='predict_batch_size', full_name='smith.TrainEvalConfig.predict_batch_size', index=4, - number=6, type=5, cpp_type=1, label=1, - has_default_value=True, default_value=32, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='max_eval_steps', full_name='smith.TrainEvalConfig.max_eval_steps', index=5, - number=7, type=5, cpp_type=1, label=1, - has_default_value=True, default_value=100, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='save_checkpoints_steps', full_name='smith.TrainEvalConfig.save_checkpoints_steps', index=6, - number=8, type=5, cpp_type=1, label=1, - has_default_value=True, default_value=1000, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='iterations_per_loop', 
full_name='smith.TrainEvalConfig.iterations_per_loop', index=7, - number=9, type=5, cpp_type=1, label=1, - has_default_value=True, default_value=1000, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='eval_with_eval_data', full_name='smith.TrainEvalConfig.eval_with_eval_data', index=8, - number=10, type=8, cpp_type=7, label=1, - has_default_value=True, default_value=True, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='neg_to_pos_example_ratio', full_name='smith.TrainEvalConfig.neg_to_pos_example_ratio', index=9, - number=12, type=2, cpp_type=6, label=1, - has_default_value=True, default_value=float(1), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=41, - serialized_end=382, -) - - -_ENCODERCONFIG = _descriptor.Descriptor( - name='EncoderConfig', - full_name='smith.EncoderConfig', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='model_name', full_name='smith.EncoderConfig.model_name', index=0, - number=12, type=9, cpp_type=9, label=1, - has_default_value=True, default_value=b"smith_dual_encoder".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='init_checkpoint', full_name='smith.EncoderConfig.init_checkpoint', index=1, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='predict_checkpoint', full_name='smith.EncoderConfig.predict_checkpoint', index=2, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='bert_config_file', full_name='smith.EncoderConfig.bert_config_file', index=3, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='doc_bert_config_file', full_name='smith.EncoderConfig.doc_bert_config_file', index=4, - number=4, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, 
containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='vocab_file', full_name='smith.EncoderConfig.vocab_file', index=5, - number=5, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='max_seq_length', full_name='smith.EncoderConfig.max_seq_length', index=6, - number=6, type=5, cpp_type=1, label=1, - has_default_value=True, default_value=32, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='max_predictions_per_seq', full_name='smith.EncoderConfig.max_predictions_per_seq', index=7, - number=7, type=5, cpp_type=1, label=1, - has_default_value=True, default_value=5, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='max_sent_length_by_word', full_name='smith.EncoderConfig.max_sent_length_by_word', index=8, - number=8, type=5, cpp_type=1, label=1, - has_default_value=True, default_value=32, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='max_doc_length_by_sentence', full_name='smith.EncoderConfig.max_doc_length_by_sentence', index=9, - number=9, type=5, cpp_type=1, label=1, - has_default_value=True, default_value=64, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='loop_sent_number_per_doc', full_name='smith.EncoderConfig.loop_sent_number_per_doc', index=10, - number=10, type=5, cpp_type=1, label=1, - has_default_value=True, default_value=64, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='sent_bert_trainable', full_name='smith.EncoderConfig.sent_bert_trainable', index=11, - number=11, type=8, cpp_type=7, label=1, - has_default_value=True, default_value=True, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='max_masked_sent_per_doc', full_name='smith.EncoderConfig.max_masked_sent_per_doc', index=12, - number=14, type=5, cpp_type=1, label=1, - has_default_value=True, default_value=2, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='use_masked_sentence_lm_loss', full_name='smith.EncoderConfig.use_masked_sentence_lm_loss', index=13, - 
number=15, type=8, cpp_type=7, label=1, - has_default_value=True, default_value=False, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='num_labels', full_name='smith.EncoderConfig.num_labels', index=14, - number=13, type=5, cpp_type=1, label=1, - has_default_value=True, default_value=2, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='doc_rep_combine_mode', full_name='smith.EncoderConfig.doc_rep_combine_mode', index=15, - number=16, type=9, cpp_type=9, label=1, - has_default_value=True, default_value=b"normal".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='doc_rep_combine_attention_size', full_name='smith.EncoderConfig.doc_rep_combine_attention_size', index=16, - number=17, type=5, cpp_type=1, label=1, - has_default_value=True, default_value=256, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=385, - serialized_end=969, -) - - -_LOSSCONFIG = _descriptor.Descriptor( - name='LossConfig', - full_name='smith.LossConfig', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='similarity_score_amplifier', full_name='smith.LossConfig.similarity_score_amplifier', index=0, - number=1, type=2, cpp_type=6, label=1, - has_default_value=True, default_value=float(6), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=971, - serialized_end=1022, -) - - -_DUALENCODERCONFIG = _descriptor.Descriptor( - name='DualEncoderConfig', - full_name='smith.DualEncoderConfig', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='encoder_config', full_name='smith.DualEncoderConfig.encoder_config', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='train_eval_config', full_name='smith.DualEncoderConfig.train_eval_config', index=1, - number=2, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - 
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='loss_config', full_name='smith.DualEncoderConfig.loss_config', index=2, - number=3, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto2', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1025, - serialized_end=1181, -) - -_DUALENCODERCONFIG.fields_by_name['encoder_config'].message_type = _ENCODERCONFIG -_DUALENCODERCONFIG.fields_by_name['train_eval_config'].message_type = _TRAINEVALCONFIG -_DUALENCODERCONFIG.fields_by_name['loss_config'].message_type = _LOSSCONFIG -DESCRIPTOR.message_types_by_name['TrainEvalConfig'] = _TRAINEVALCONFIG -DESCRIPTOR.message_types_by_name['EncoderConfig'] = _ENCODERCONFIG -DESCRIPTOR.message_types_by_name['LossConfig'] = _LOSSCONFIG -DESCRIPTOR.message_types_by_name['DualEncoderConfig'] = _DUALENCODERCONFIG -_sym_db.RegisterFileDescriptor(DESCRIPTOR) - -TrainEvalConfig = _reflection.GeneratedProtocolMessageType('TrainEvalConfig', (_message.Message,), { - 'DESCRIPTOR' : _TRAINEVALCONFIG, - '__module__' : 'smith.experiment_config_pb2' - # @@protoc_insertion_point(class_scope:smith.TrainEvalConfig) - }) -_sym_db.RegisterMessage(TrainEvalConfig) - -EncoderConfig = _reflection.GeneratedProtocolMessageType('EncoderConfig', (_message.Message,), { - 'DESCRIPTOR' : _ENCODERCONFIG, - '__module__' : 'smith.experiment_config_pb2' - # @@protoc_insertion_point(class_scope:smith.EncoderConfig) - }) -_sym_db.RegisterMessage(EncoderConfig) - -LossConfig = _reflection.GeneratedProtocolMessageType('LossConfig', (_message.Message,), { - 'DESCRIPTOR' : _LOSSCONFIG, - '__module__' : 'smith.experiment_config_pb2' - # @@protoc_insertion_point(class_scope:smith.LossConfig) - }) -_sym_db.RegisterMessage(LossConfig) - -DualEncoderConfig = _reflection.GeneratedProtocolMessageType('DualEncoderConfig', (_message.Message,), { - 'DESCRIPTOR' : _DUALENCODERCONFIG, - '__module__' : 'smith.experiment_config_pb2' - # @@protoc_insertion_point(class_scope:smith.DualEncoderConfig) - }) -_sym_db.RegisterMessage(DualEncoderConfig) - - -# @@protoc_insertion_point(module_scope) - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/input_fns.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/input_fns.py deleted file mode 100644 index 5b52cf636db13965527d0ad457b054cab1ebf00c..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/input_fns.py +++ /dev/null @@ -1,219 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Input functions used in dual encoder SMITH model.""" -from npu_bridge.npu_init import * - - -from absl import flags -import tensorflow.compat.v1 as tf # tf -from smith import constants -FLAGS = flags.FLAGS - - -def input_fn_builder(input_files, - is_training, - drop_remainder, - max_seq_length=32, - max_predictions_per_seq=5, - num_cpu_threads=4, - batch_size=16, - is_prediction=False): - """Creates an `input_fn` closure to be passed to TPUEstimator.""" - - def input_fn(params): # pylint: disable=unused-argument - """The actual input function.""" - name_to_features = { - "input_ids_1": tf.FixedLenFeature([max_seq_length], tf.int64), - "input_mask_1": tf.FixedLenFeature([max_seq_length], tf.int64), - "input_ids_2": tf.FixedLenFeature([max_seq_length], tf.int64), - "input_mask_2": tf.FixedLenFeature([max_seq_length], tf.int64), - "documents_match_labels": tf.FixedLenFeature([1], tf.float32, 0) - } - if (FLAGS.train_mode == constants.TRAIN_MODE_PRETRAIN or - FLAGS.train_mode == constants.TRAIN_MODE_JOINT_TRAIN): - # Add some features related to word masked LM losses. - name_to_features["masked_lm_positions_1"] = tf.FixedLenFeature( - [max_predictions_per_seq], tf.int64) - name_to_features["masked_lm_ids_1"] = tf.FixedLenFeature( - [max_predictions_per_seq], tf.int64) - name_to_features["masked_lm_weights_1"] = tf.FixedLenFeature( - [max_predictions_per_seq], tf.float32) - name_to_features["masked_lm_positions_2"] = tf.FixedLenFeature( - [max_predictions_per_seq], tf.int64) - name_to_features["masked_lm_ids_2"] = tf.FixedLenFeature( - [max_predictions_per_seq], tf.int64) - name_to_features["masked_lm_weights_2"] = tf.FixedLenFeature( - [max_predictions_per_seq], tf.float32) - - # For training, we want a lot of parallel reading and shuffling. - # For eval, we want no shuffling and parallel reading doesn't matter. - if is_training: - file_list = tf.data.Dataset.list_files(tf.constant(input_files)) - file_list = file_list.shuffle(buffer_size=len(input_files)) - # `cycle_length` is the number of parallel files that get read. - cycle_length = min(num_cpu_threads, len(input_files)) - # `sloppy` mode means that the interleaving is not exact. This adds - # even more randomness to the training pipeline. - d = file_list.apply( - tf.data.experimental.parallel_interleave( - tf.data.TFRecordDataset, - sloppy=is_training, - cycle_length=cycle_length)) - d = d.repeat() - d = d.shuffle(buffer_size=100) - - else: - d = tf.data.TFRecordDataset(tf.constant(input_files)) - # In contrast to TPU training/evaluation, the input_fn for prediction - # should raise an end-of-input exception (OutOfRangeError or - # StopIteration), which serves as the stopping signal to TPUEstimator. - # Thus during model prediction, the data can not be repeated forever. 
- # Refer to - # https://www.tensorflow.org/api_docs/python/tf/compat/v1/estimator/tpu/TPUEstimator#predict - if not is_prediction: - # Since we evaluate for a fixed number of steps we don't want to - # encounter out-of-range exceptions. - d = d.repeat() - - # We must `drop_remainder` on training because the TPU requires fixed - # size dimensions. For eval, we assume we are evaling on the CPU or GPU - # and we *don"t* want to drop the remainder, otherwise we won't cover - # every sample. - d = d.apply( - tf.data.experimental.map_and_batch( - lambda record: _decode_record(record, name_to_features), - batch_size=batch_size, - num_parallel_batches=num_cpu_threads, - drop_remainder=True)) - return d - - return input_fn - - -def _decode_record(record, name_to_features): - """Decodes a record to a TensorFlow example.""" - example = tf.parse_single_example(record, name_to_features) - # tf.Example only supports tf.int64, but the TPU only supports tf.int32. - # So cast all int64 to int32. - example["input_ids_1"] = tf.cast(example["input_ids_1"], tf.int32) - example["input_ids_2"] = tf.cast(example["input_ids_2"], tf.int32) - example["documents_match_labels"] = tf.cast(example["documents_match_labels"], - tf.float32) - example["input_mask_1"] = tf.cast(example["input_mask_1"], tf.int32) - example["input_mask_2"] = tf.cast(example["input_mask_2"], tf.int32) - if (FLAGS.train_mode == constants.TRAIN_MODE_PRETRAIN or - FLAGS.train_mode == constants.TRAIN_MODE_JOINT_TRAIN): - example["masked_lm_ids_1"] = tf.cast(example["masked_lm_ids_1"], tf.int32) - example["masked_lm_ids_2"] = tf.cast(example["masked_lm_ids_2"], tf.int32) - example["masked_lm_weights_1"] = tf.cast(example["masked_lm_weights_1"], - tf.float32) - example["masked_lm_weights_2"] = tf.cast(example["masked_lm_weights_2"], - tf.float32) - example["masked_lm_positions_1"] = tf.cast(example["masked_lm_positions_1"], - tf.int32) - example["masked_lm_positions_2"] = tf.cast(example["masked_lm_positions_2"], - tf.int32) - return example - - -def make_serving_input_example_fn(max_seq_length=32, max_predictions_per_seq=5): - """Returns an Estimator input_fn for serving the model. - - Args: - max_seq_length: The max input sequence length. - max_predictions_per_seq: The max number of masked words per sequence. - - Returns: - An Estimator input_fn for serving the model. - """ - - def _serving_input_fn(): - """An input_fn that expects a serialized tf.Example.""" - - serialized_example = tf.placeholder( - dtype=tf.string, shape=[None], name="examples") - receiver_tensors = {"examples": serialized_example} - name_to_features = { - "input_ids_1": tf.FixedLenFeature([max_seq_length], tf.int64), - "input_mask_1": tf.FixedLenFeature([max_seq_length], tf.int64), - "input_ids_2": tf.FixedLenFeature([max_seq_length], tf.int64), - "input_mask_2": tf.FixedLenFeature([max_seq_length], tf.int64), - "documents_match_labels": tf.FixedLenFeature([1], tf.float32, 0) - } - if (FLAGS.train_mode == constants.TRAIN_MODE_PRETRAIN or - FLAGS.train_mode == constants.TRAIN_MODE_JOINT_TRAIN): - # This is to support model export during model pretraining or - # joint-training process. 
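    # Illustrative usage sketch (hypothetical; "estimator" is assumed to be an
    # already constructed tf.estimator.Estimator, which is not defined in this
    # file): the closure returned by make_serving_input_example_fn() is handed
    # to the SavedModel export call, e.g.
    #   serving_fn = make_serving_input_example_fn(max_seq_length=32,
    #                                               max_predictions_per_seq=5)
    #   estimator.export_saved_model("./export_dir", serving_fn)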
- name_to_features["masked_lm_positions_1"] = tf.FixedLenFeature( - [max_predictions_per_seq], tf.int64) - name_to_features["masked_lm_ids_1"] = tf.FixedLenFeature( - [max_predictions_per_seq], tf.int64) - name_to_features["masked_lm_weights_1"] = tf.FixedLenFeature( - [max_predictions_per_seq], tf.float32) - name_to_features["masked_lm_positions_2"] = tf.FixedLenFeature( - [max_predictions_per_seq], tf.int64) - name_to_features["masked_lm_ids_2"] = tf.FixedLenFeature( - [max_predictions_per_seq], tf.int64) - name_to_features["masked_lm_weights_2"] = tf.FixedLenFeature( - [max_predictions_per_seq], tf.float32) - - parsed_features = tf.parse_example(serialized_example, name_to_features) - # As tf.Example only supports tf.int64, but the TPU only supports - # tf.int32, we need to cast all int64 to int32. - parsed_features["input_ids_1"] = tf.cast(parsed_features["input_ids_1"], - tf.int32) - parsed_features["input_ids_2"] = tf.cast(parsed_features["input_ids_2"], - tf.int32) - parsed_features["documents_match_labels"] = tf.cast( - parsed_features["documents_match_labels"], tf.float32) - parsed_features["input_mask_1"] = tf.cast(parsed_features["input_mask_1"], - tf.int32) - parsed_features["input_mask_2"] = tf.cast(parsed_features["input_mask_2"], - tf.int32) - if (FLAGS.train_mode == constants.TRAIN_MODE_PRETRAIN or - FLAGS.train_mode == constants.TRAIN_MODE_JOINT_TRAIN): - parsed_features["masked_lm_ids_1"] = tf.cast( - parsed_features["masked_lm_ids_1"], tf.int32) - parsed_features["masked_lm_ids_2"] = tf.cast( - parsed_features["masked_lm_ids_2"], tf.int32) - parsed_features["masked_lm_weights_1"] = tf.cast( - parsed_features["masked_lm_weights_1"], tf.float32) - parsed_features["masked_lm_weights_2"] = tf.cast( - parsed_features["masked_lm_weights_2"], tf.float32) - parsed_features["masked_lm_positions_1"] = tf.cast( - parsed_features["masked_lm_positions_1"], tf.int32) - parsed_features["masked_lm_positions_2"] = tf.cast( - parsed_features["masked_lm_positions_2"], tf.int32) - return tf.estimator.export.ServingInputReceiver( - features=parsed_features, receiver_tensors=receiver_tensors) - - return _serving_input_fn - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/layers.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/layers.py deleted file mode 100644 index 2d1d1ee7df4a2e1c54d77aec7b315b29a32796fe..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/layers.py +++ /dev/null @@ -1,606 +0,0 @@ -# coding=utf-8 -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Model layers in dual encoder SMITH model.""" -from npu_bridge.npu_init import * -from six.moves import range -from npu_bridge.estimator.npu import npu_convert_dropout -import tensorflow.compat.v1 as tf - -from smith import constants -from smith.bert import modeling - - -def get_doc_rep_with_masked_sent(input_sent_reps_doc, - sent_mask_embedding, - input_mask_doc_level, - batch_size_static=32, - max_masked_sent_per_doc=2, - loop_sent_number_per_doc=32): - """Get the document representations with masked sentences. - - Args: - input_sent_reps_doc: float Tensor. The independent sentence embeddings - without masks for the sentences in the current document. The shape is - [batch, loop_sent_number_per_doc, hidden]. - sent_mask_embedding: float Tensor. The sentence embedding vector for the - masked position. The shape is [hidden]. - input_mask_doc_level: int Tensor. The input masks on the document level to - identify whether a location is a real sentence (mask = 1) or a padded - sentence (mask = 0). The shape is [batch, loop_sent_number_per_doc]. - batch_size_static: scalar. The static batch size depending on the training - or the evaluation mode. - max_masked_sent_per_doc: scalar. The maximum number of masked sentences - per document. - loop_sent_number_per_doc: scalar. The number of looped sentences per - document. - - Returns: - The document representations with masked sentences and the positions/ - weights for each masked sentences. This masked sentence weight is 1 for the - sampled real sentence position and 0 for the padded sentence position. - """ - # We at least mask two sentences to build a candidate sentence pool for - # negative sentence sampling. We generate the masked_sent_index and - # masked_sent_weight for each document. Note that we do not add any word - # or sentence level masks during prediction or inference stage. - max_masked_sent_per_doc = max(max_masked_sent_per_doc, 2) - input_sent_reps_doc_list = tf.unstack( - input_sent_reps_doc, num=batch_size_static) - real_sent_number_per_doc = tf.unstack( - tf.reduce_sum(input_mask_doc_level, 1), num=batch_size_static) - masked_sent_index_list = [] - masked_sent_weight_list = [] - - # For each example in the current batch, we randomly sample - # max_masked_sent_per_doc positions to mask the sentences. For each masked - # sentence position, the sentence in the current position is the positive - # example. The other co-masked sentences are the negative examples. - # The sampled sentence indexes will not be duplicated. - for batch_i in range(0, batch_size_static): - # Since everything in TPU must have a fixed shape, here the max sampled - # sentence index can be as large as loop_sent_number_per_doc. We will - # generate the corresponding sentence LM weights to reduce the impact - # on the final masked sentence LM loss following a similar way with the - # handling of masked word LM loss and masked word LM weights. 
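    # Illustrative worked example of the sampling described above (concrete
    # numbers, not computed in this file): with loop_sent_number_per_doc=6,
    # max_masked_sent_per_doc=2 and a document whose real_sent_number is 4,
    # one shuffled draw could be [5, 1]; after tf.sort it becomes [1, 5], and
    # the sentence LM weights are cast(less([1, 5], 4)) = [1.0, 0.0], so the
    # draw that landed on a padded sentence contributes nothing to the loss.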
- real_sent_number = real_sent_number_per_doc[batch_i] - sampled_sent_index = tf.slice( - tf.random_shuffle(tf.range(loop_sent_number_per_doc)), [0], - [max_masked_sent_per_doc]) - sampled_sent_index = tf.sort(sampled_sent_index) - masked_sent_index_list.append(sampled_sent_index) - # Generates the corresponding sampled_sent_weight - sample_sent_weight = tf.cast( - tf.less(sampled_sent_index, real_sent_number), tf.float32) - masked_sent_weight_list.append(sample_sent_weight) - - indices = tf.reshape(sampled_sent_index, [max_masked_sent_per_doc, -1]) - # Duplicates sent_mask_embedding for each masked position. - updates = tf.reshape( - tf.tile( - sent_mask_embedding, - [max_masked_sent_per_doc], - ), [max_masked_sent_per_doc, -1]) - input_sent_reps_doc_list[batch_i] = tf.tensor_scatter_update( - input_sent_reps_doc_list[batch_i], indices, updates) - # Here masked_sent_index_list is a list a tensors, where each tensor stores - # the masked sentence positions for each document in the current batch. The - # shape of masked_sent_index_list is [batch, max_masked_sent_per_doc]. - # Here masked_sent_weight_list is a list a tensors, where each tensor stores - # the masked sentence weights for each document in the current batch. The - # shape of masked_sent_weight_list is [batch, max_masked_sent_per_doc]. - return (tf.stack(input_sent_reps_doc_list), tf.stack(masked_sent_index_list), - tf.stack(masked_sent_weight_list)) - - -def get_masked_sent_lm_output(bert_config, - input_tensor, - cur_sent_reps_doc_unmask, - sent_masked_positions, - sent_masked_weights, - debugging=False): - """Get the sentence level masked LM loss. - - Args: - bert_config: BertConfig object. The configuration file for the document - level BERT model. - input_tensor: float Tensor. The contextualized representations of all - sentences learned by the document level BERT model. The shape is [batch, - loop_sent_number_per_doc, hidden]. This is the model prediction. - cur_sent_reps_doc_unmask: float Tensor. The unmasked sentence - representations of the current document. The shape is [batch, - loop_sent_number_per_doc, hidden]. This is the source of the ground - truth and negative examples in the masked sentence prediction. - sent_masked_positions: int Tensor. The masked sentence positions in the - current document. The shape is [batch, max_masked_sent_per_doc]. - sent_masked_weights: float Tensor. The masked sentence weights in the - current document. The shape is [batch, max_masked_sent_per_doc]. - debugging: bool. Whether it is in the debugging mode. - - Returns: - The masked sentence LM loss and the mask sentence LM loss per example. - - """ - # The current method for masked sentence prediction: we approach this problem - # as a multi-class classification problem similar to the masked word LM task. - # For each masked sentence position, the sentence in the current position is - # the positive example. The other co-masked sentences in the current document - # and in the other documents of the same batch are the negative examples. We - # compute the cross entropy loss over the sentence prediction task following - # the implementation of the masked word LM loss in the BERT model. - - input_tensor_shape = modeling.get_shape_list(input_tensor) - batch_size = input_tensor_shape[0] - masked_position_shape = modeling.get_shape_list(sent_masked_positions) - max_predictions_per_seq = masked_position_shape[1] - - # In the context of masked sentence prediction, the max_predictions_per_seq - # is the same with max_masked_sent_per_doc. 
- # Output Shape: [batch * max_predictions_per_seq, hidden]. - # Input_tensor is the model prediction for each position. - input_tensor = gather_indexes(input_tensor, sent_masked_positions) - # Independent_sent_embeddings is the ground truth input sentence embeddings - # for the document level BERT model. The output shape is [batch * - # max_predictions_per_seq, hidden]. - independent_sent_embeddings = gather_indexes(cur_sent_reps_doc_unmask, - sent_masked_positions) - - with tf.variable_scope("cls/sent_predictions", reuse=tf.AUTO_REUSE): - # We apply one more non-linear transformation before the output layer. - # This matrix is not used after pre-training. - with tf.variable_scope("transform"): - input_tensor = tf.layers.dense( - input_tensor, - units=bert_config.hidden_size, - activation=modeling.get_activation(bert_config.hidden_act), - kernel_initializer=modeling.create_initializer( - bert_config.initializer_range)) - # Output Shape: [batch * max_predictions_per_seq, hidden]. - input_tensor = modeling.layer_norm(input_tensor) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each predicted position. - output_bias = tf.get_variable( - "output_bias", - shape=[batch_size * max_predictions_per_seq], - initializer=tf.zeros_initializer()) - # Shape of input_tensor [batch * max_predictions_per_seq, hidden]. - # Shape of independent_sent_embeddings is [batch * max_predictions_per_seq, - # hidden]. - # Shape of logits: [batch * max_predictions_per_seq, - # batch * max_predictions_per_seq]. - logits = tf.matmul( - input_tensor, independent_sent_embeddings, transpose_b=True) - logits = tf.nn.bias_add(logits, output_bias) - # Output Shape: [batch * max_predictions_per_seq, - # batch * max_predictions_per_seq]. - log_probs = tf.nn.log_softmax(logits, axis=-1) - - # Output Shape: [batch * max_predictions_per_seq]. - # Double checked the setting of label_ids here. The label_ids - # should be the label index in the "sentence vocabulary". Thus if batch=32, - # max_predictions_per_seq = 2, then label ids should be like - # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ..., 63]. For the ground truth one hot - # label matrix, only the values in the diagonal positions are 1. All the - # other positions should be 0. - label_ids = tf.range( - 0, batch_size * max_predictions_per_seq, dtype=tf.int32) - if debugging: - label_ids = tf.Print( - label_ids, [label_ids], - message="label_ids in get_masked_sent_lm_output", - summarize=30) - # Output Shape: [batch * max_predictions_per_seq]. - # The label_weights is the flatten vector based on sent_masked_weights, - # where the weight is 1.0 for sampled real sentences and 0.0 for sampled - # masked sentences. - label_weights = tf.reshape(sent_masked_weights, [-1]) - - # Output Shape: [batch * max_predictions_per_seq, - # batch * max_predictions_per_seq]. - one_hot_labels = tf.one_hot( - label_ids, depth=batch_size * max_predictions_per_seq, dtype=tf.float32) - - # Output Shape: [batch * max_predictions_per_seq]. - per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) - # Output Shape: [1]. - numerator = tf.reduce_sum(label_weights * per_example_loss) - # Output Shape: [1]. - denominator = tf.reduce_sum(label_weights) + 1e-5 - # Output Shape: [1]. - loss = numerator / denominator - # Shape of loss [1]. - # Shape of per_example_loss is [batch * max_predictions_per_seq]. 
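    # Illustrative numeric check of the weighted average above (made-up values,
    # not computed in this file): with
    #   per_example_loss = [2.0, 0.5, 1.0, 3.0], label_weights = [1., 0., 1., 1.]
    # the numerator is 2.0 + 1.0 + 3.0 = 6.0, the denominator is 3.0 + 1e-5,
    # and the loss is ~2.0; the masked position with weight 0.0 (a padded
    # sentence) is excluded from the average.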
- return (loss, per_example_loss, log_probs) - - -def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, - label_ids, label_weights): - """Get loss and log probs for the masked LM.""" - # Output Shape: [batch * max_predictions_per_seq, hidden]. - input_tensor = gather_indexes(input_tensor, positions) - - with tf.variable_scope("cls/word_predictions", reuse=tf.AUTO_REUSE): - # We apply one more non-linear transformation before the output layer. - # This matrix is not used after pre-training. - with tf.variable_scope("transform"): - input_tensor = tf.layers.dense( - input_tensor, - units=bert_config.hidden_size, - activation=modeling.get_activation(bert_config.hidden_act), - kernel_initializer=modeling.create_initializer( - bert_config.initializer_range)) - # Output Shape: [batch * max_predictions_per_seq, hidden]. - input_tensor = modeling.layer_norm(input_tensor) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - output_bias = tf.get_variable( - "output_bias", - shape=[bert_config.vocab_size], - initializer=tf.zeros_initializer()) - # Shape of input_tensor [batch * max_predictions_per_seq, embedding_size]. - # Shape of output_weights (embed table) is [vocab_size, embedding_size]. - # In the current Bert implementation: embedding_size = hidden. - logits = tf.matmul(input_tensor, output_weights, transpose_b=True) - logits = tf.nn.bias_add(logits, output_bias) - # Output Shape: [batch * max_predictions_per_seq, vocab_size]. - log_probs = tf.nn.log_softmax(logits, axis=-1) - - # Output Shape: [batch * max_predictions_per_seq]. - label_ids = tf.reshape(label_ids, [-1]) - # Output Shape: [batch * max_predictions_per_seq]. - label_weights = tf.reshape(label_weights, [-1]) - - # Output Shape: [batch * max_predictions_per_seq, vocab_size]. - one_hot_labels = tf.one_hot( - label_ids, depth=bert_config.vocab_size, dtype=tf.float32) - - # The `positions` tensor might be zero-padded (if the sequence is too - # short to have the maximum number of predictions). The `label_weights` - # tensor has a value of 1.0 for every real prediction and 0.0 for the - # padding predictions. - # Output Shape: [batch * max_predictions_per_seq]. - per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) - # Output Shape: [1]. - numerator = tf.reduce_sum(label_weights * per_example_loss) - # Output Shape: [1]. - denominator = tf.reduce_sum(label_weights) + 1e-5 - # Output Shape: [1]. - loss = numerator / denominator - # Shape of loss [1]. - # Shape of per_example_loss is [batch * max_predictions_per_seq]. - return (loss, per_example_loss, log_probs) - - -def gather_indexes(sequence_tensor, positions): - """Gathers the vectors at the specific positions over a minibatch.""" - # Shape of positions: [batch, max_mask_per_seq]. - sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) - batch_size = sequence_shape[0] - seq_length = sequence_shape[1] - width = sequence_shape[2] - - # Shape of flat_offsets: [batch, 1]. - flat_offsets = tf.reshape( - tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) - flat_positions = tf.reshape(positions + flat_offsets, [-1]) - flat_sequence_tensor = tf.reshape(sequence_tensor, - [batch_size * seq_length, width]) - output_tensor = tf.gather(flat_sequence_tensor, flat_positions) - # The shape of output_tensor [batch * max_mask_per_seq, hidden]. 
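  # Illustrative worked example of the offset arithmetic above (plain numbers,
  # not computed in this file): with batch_size=2, seq_length=3 and
  # positions=[[0, 2], [1, 2]], flat_offsets is [[0], [3]], so flat_positions
  # becomes [0, 2, 4, 5] and tf.gather picks rows 0, 2, 4 and 5 of the
  # flattened [6, width] sequence tensor.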
- return output_tensor - - -def get_attention_weighted_sum(input_tensor, bert_config, is_training, - attention_size): - """Compute the attentive weighted sum of an input tensor. - - Args: - input_tensor: The input tensor for attentive representation. The shape of - input tensor is [batch, seq_length, hidden]. - bert_config: The model config file. - is_training: If true, it is in training mode. - attention_size: int. Dimension of contextual vector. - - Returns: - The attentive representation of the input tensor. The shape of the output - tensor is [batch, hidden]. - """ - with tf.variable_scope("combine_reps_attention", reuse=tf.AUTO_REUSE): - context_vector = tf.get_variable( - name="context_vector", - shape=[attention_size], - dtype=tf.float32) - # Output Shape: [batch, seq_length, attention_size]. - projection = tf.layers.dense( - input_tensor, - attention_size, - activation=tf.tanh, - kernel_initializer=modeling.create_initializer( - bert_config.initializer_range)) - # Output Shape: [batch, seq_length, 1]. - attention = tf.reduce_sum( - tf.multiply(projection, context_vector), axis=2, keep_dims=True) - # Output Shape: [batch, seq_length, 1]. - attention = tf.nn.softmax(attention, axis=1) - # Output Shape: [batch, hidden]. - last_outputs = tf.reduce_sum(tf.multiply(input_tensor, attention), axis=1) - if is_training: - last_outputs = tf.layers.dropout( - last_outputs, bert_config.attention_probs_dropout_prob, training=True) - return last_outputs - - -def get_seq_rep_from_bert(bert_model): - """Get the sequence represenation given a BERT encoder.""" - siamese_input_tensor = bert_model.get_pooled_output() - hidden_size = siamese_input_tensor.shape[-1].value - siamese_input_tensor = tf.layers.dense( - siamese_input_tensor, units=hidden_size, activation=tf.nn.relu) - normalized_siamese_input_tensor = tf.nn.l2_normalize( - siamese_input_tensor, axis=1) - return normalized_siamese_input_tensor - - -class GetSentRepsMasksNormalLoop(object): - - def __init__(self, input_sent_reps_doc=None, input_mask_doc_level=None, masked_lm_loss_doc=None, - masked_lm_example_loss_doc=None, masked_lm_weights_doc=None): - self.input_sent_reps_doc = input_sent_reps_doc - self.input_mask_doc_level = input_mask_doc_level - self.masked_lm_loss_doc = masked_lm_loss_doc - self.masked_lm_example_loss_doc = masked_lm_example_loss_doc - self.masked_lm_weights_doc = masked_lm_weights_doc - - def get_sent_reps_masks_normal_loop(self, sent_index, - input_sent_reps_doc, - input_mask_doc_level, - masked_lm_loss_doc, - masked_lm_example_loss_doc, - masked_lm_weights_doc, - dual_encoder_config, - is_training, - train_mode, - input_ids, - input_mask, - masked_lm_positions, - masked_lm_ids, - masked_lm_weights, - use_one_hot_embeddings, - debugging=False): - """Get the sentence encodings, mask ids and masked word LM loss. - - Args: - sent_index: The index of the current looped sentence. - input_sent_reps_doc: The representations of all sentences in the doc - learned by BERT. - input_mask_doc_level: The document level input masks, which indicates - whether a sentence is a real sentence or a padded sentence. - masked_lm_loss_doc: The sum of all the masked word LM loss. - masked_lm_example_loss_doc: The per example masked word LM loss. - masked_lm_weights_doc: the weights of the maksed LM words. If the position - is corresponding to a real masked word, it is 1.0; It is a padded mask, - the weight is 0. - dual_encoder_config: The config of the dual encoder. - is_training: Whether it is in the training mode. - train_mode: string. 
The train mode which can be finetune, joint_train, or - pretrain. - input_ids: The ids of the input tokens. - input_mask: The mask of the input tokens. - masked_lm_positions: The positions of the masked words in the language - model training. - masked_lm_ids: The ids of the masked words in LM model training. - masked_lm_weights: The weights of the masked words in LM model training. - use_one_hot_embeddings: Whether use one hot embedding. It should be true - for the runs on TPUs. - debugging: bool. Whether it is in the debugging mode. - - Returns: - A list of tensors on the learned sentence representations and the masked - word LM loss. - """ - # Collect token information for the current sentence. - bert_config = modeling.BertConfig.from_json_file( - dual_encoder_config.encoder_config.bert_config_file) - max_sent_length_by_word = dual_encoder_config.encoder_config.max_sent_length_by_word - sent_bert_trainable = dual_encoder_config.encoder_config.sent_bert_trainable - max_predictions_per_seq = dual_encoder_config.encoder_config.max_predictions_per_seq - sent_start = sent_index * max_sent_length_by_word - input_ids_cur_sent = tf.slice(input_ids, [0, sent_start], - [-1, max_sent_length_by_word]) - # Output shape: [batch, max_sent_length_by_word]. - input_mask_cur_sent = tf.slice(input_mask, [0, sent_start], - [-1, max_sent_length_by_word]) - # Output Shape: [batch]. - input_mask_cur_sent_max = tf.reduce_max(input_mask_cur_sent, 1) - # Output Shape: [loop_sent_number_per_doc, batch]. - input_mask_doc_level.append(input_mask_cur_sent_max) - if debugging: - input_ids_cur_sent = tf.Print( - input_ids_cur_sent, [input_ids_cur_sent, input_mask_cur_sent], - message="input_ids_cur_sent in get_sent_reps_masks_lm_loss", - summarize=20) - model = modeling.BertModel( - config=bert_config, - is_training=is_training, - input_ids=input_ids_cur_sent, - input_mask=input_mask_cur_sent, - use_one_hot_embeddings=use_one_hot_embeddings, - sent_bert_trainable=sent_bert_trainable) - with tf.variable_scope("seq_rep_from_bert_sent_dense", reuse=tf.AUTO_REUSE): - normalized_siamese_input_tensor = get_seq_rep_from_bert(model) - input_sent_reps_doc.append(normalized_siamese_input_tensor) - - if (train_mode == constants.TRAIN_MODE_PRETRAIN or - train_mode == constants.TRAIN_MODE_JOINT_TRAIN): - # Collect masked token information for the current sentence. - sent_mask_lm_token_start = sent_index * max_predictions_per_seq - # Output shape: [batch, max_predictions_per_seq]. - masked_lm_positions_cur_sent = tf.slice(masked_lm_positions, - [0, sent_mask_lm_token_start], - [-1, max_predictions_per_seq]) - masked_lm_ids_cur_sent = tf.slice(masked_lm_ids, - [0, sent_mask_lm_token_start], - [-1, max_predictions_per_seq]) - masked_lm_weights_cur_sent = tf.slice(masked_lm_weights, - [0, sent_mask_lm_token_start], - [-1, max_predictions_per_seq]) - # Since in the processed data of smith model, the masked lm positions are - # global indices started from the 1st token of the whole sequence, we need - # to transform this global position to a local position for the current - # sentence. The position index is started from 0. - # Local_index = global_index mod max_sent_length_by_word. - masked_lm_positions_cur_sent = tf.mod(masked_lm_positions_cur_sent, - max_sent_length_by_word) - # Shape of masked_lm_loss_cur_sent [1]. - # Shape of masked_lm_example_loss_cur_sent is [batch, - # max_predictions_per_seq]. 
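      # Illustrative worked example of the global-to-local transform above
      # (concrete numbers, not computed here): with max_sent_length_by_word=32,
      # a global masked position of 70 falls in the third sentence (tokens
      # 64..95) and maps to local position 70 mod 32 = 6 within that sentence.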
- (masked_lm_loss_cur_sent, masked_lm_example_loss_cur_sent, - _) = get_masked_lm_output(bert_config, model.get_sequence_output(), - model.get_embedding_table(), - masked_lm_positions_cur_sent, - masked_lm_ids_cur_sent, - masked_lm_weights_cur_sent) - # Output Shape: [1]. - masked_lm_loss_doc += masked_lm_loss_cur_sent - # Output Shape: [loop_sent_number_per_doc, batch * max_predictions_per_seq]. - masked_lm_example_loss_doc.append(masked_lm_example_loss_cur_sent) - # Output Shape: [loop_sent_number_per_doc, batch, max_predictions_per_seq]. - masked_lm_weights_doc.append(masked_lm_weights_cur_sent) - - return GetSentRepsMasksNormalLoop(input_sent_reps_doc, input_mask_doc_level, masked_lm_loss_doc, - masked_lm_example_loss_doc, masked_lm_weights_doc) - - -class LearnSentRepsNormalLoop(object): - - def __init__(self, input_sent_reps_doc_1_unmask=None, input_mask_doc_level_1_tensor=None, - input_sent_reps_doc_2_unmask=None, input_mask_doc_level_2_tensor=None, - masked_lm_loss_doc_1=None, masked_lm_loss_doc_2=None, - masked_lm_example_loss_doc_1=None, masked_lm_example_loss_doc_2=None, - masked_lm_weights_doc_1=None, masked_lm_weights_doc_2=None): - self.input_sent_reps_doc_1_unmask = input_sent_reps_doc_1_unmask - self.input_mask_doc_level_1_tensor = input_mask_doc_level_1_tensor - self.input_sent_reps_doc_2_unmask = input_sent_reps_doc_2_unmask - self.input_mask_doc_level_2_tensor = input_mask_doc_level_2_tensor - self.masked_lm_loss_doc_1 = masked_lm_loss_doc_1 - self.masked_lm_loss_doc_2 = masked_lm_loss_doc_2 - self.masked_lm_example_loss_doc_1 = masked_lm_example_loss_doc_1 - self.masked_lm_example_loss_doc_2 = masked_lm_example_loss_doc_2 - self.masked_lm_weights_doc_1 = masked_lm_weights_doc_1 - self.masked_lm_weights_doc_2 = masked_lm_weights_doc_2 - - def learn_sent_reps_normal_loop(self, dual_encoder_config, is_training, train_mode, - input_ids_1, input_mask_1, - masked_lm_positions_1, masked_lm_ids_1, - masked_lm_weights_1, input_ids_2, input_mask_2, - masked_lm_positions_2, masked_lm_ids_2, - masked_lm_weights_2, use_one_hot_embeddings): - """Learn the sentence representations with normal loop functions.""" - input_sent_reps_doc_1 = [] - # Generate document level input masks on each sentence based on the word - # level input mask information. - input_mask_doc_level_1 = [] - masked_lm_loss_doc_1 = 0.0 - masked_lm_example_loss_doc_1 = [] - masked_lm_weights_doc_1 = [] - - input_mask_doc_level_2 = [] - input_sent_reps_doc_2 = [] - masked_lm_loss_doc_2 = 0.0 - masked_lm_example_loss_doc_2 = [] - masked_lm_weights_doc_2 = [] - - # Learn the representation for each sentence in the document. - # Setting smaller number of loop_sent_number_per_doc can save memory for the - # model training. - # Shape of masked_lm_loss_doc_1 [1]. - # Shape of masked_lm_example_loss_doc_1 is [max_doc_length_by_sentence, - # batch * max_predictions_per_seq]. 
- for sent_index in range( - 0, dual_encoder_config.encoder_config.loop_sent_number_per_doc): - sent_reps_masks_normal_loop = GetSentRepsMasksNormalLoop().get_sent_reps_masks_normal_loop( - sent_index, input_sent_reps_doc_1, input_mask_doc_level_1, - masked_lm_loss_doc_1, masked_lm_example_loss_doc_1, - masked_lm_weights_doc_1, dual_encoder_config, is_training, train_mode, - input_ids_1, input_mask_1, masked_lm_positions_1, masked_lm_ids_1, - masked_lm_weights_1, use_one_hot_embeddings) - input_sent_reps_doc_1, input_mask_doc_level_1, masked_lm_loss_doc_1, \ - masked_lm_example_loss_doc_1, masked_lm_weights_doc_1 = \ - sent_reps_masks_normal_loop.input_sent_reps_doc, sent_reps_masks_normal_loop.input_mask_doc_level, \ - sent_reps_masks_normal_loop.masked_lm_loss_doc, \ - sent_reps_masks_normal_loop.masked_lm_example_loss_doc, sent_reps_masks_normal_loop.masked_lm_weights_doc - - sent_reps_masks_normal_loop = GetSentRepsMasksNormalLoop().get_sent_reps_masks_normal_loop( - sent_index, input_sent_reps_doc_2, input_mask_doc_level_2, - masked_lm_loss_doc_2, masked_lm_example_loss_doc_2, - masked_lm_weights_doc_2, dual_encoder_config, is_training, train_mode, - input_ids_2, input_mask_2, masked_lm_positions_2, masked_lm_ids_2, - masked_lm_weights_2, use_one_hot_embeddings) - - input_sent_reps_doc_2, input_mask_doc_level_2, masked_lm_loss_doc_2, \ - masked_lm_example_loss_doc_2, masked_lm_weights_doc_2 = \ - sent_reps_masks_normal_loop.input_sent_reps_doc, sent_reps_masks_normal_loop.input_mask_doc_level, \ - sent_reps_masks_normal_loop.masked_lm_loss_doc, \ - sent_reps_masks_normal_loop.masked_lm_example_loss_doc, sent_reps_masks_normal_loop.masked_lm_weights_doc - - # Stack the sentence representations to learn the doc representations. - # Output Shape: [batch, loop_sent_number_per_doc, hidden]. - input_sent_reps_doc_1_unmask = tf.stack(input_sent_reps_doc_1, axis=1) - input_sent_reps_doc_2_unmask = tf.stack(input_sent_reps_doc_2, axis=1) - - # Output Shape: [batch, loop_sent_number_per_doc]. - input_mask_doc_level_1_tensor = tf.stack(input_mask_doc_level_1, axis=1) - input_mask_doc_level_2_tensor = tf.stack(input_mask_doc_level_2, axis=1) - - if (train_mode == constants.TRAIN_MODE_PRETRAIN or - train_mode == constants.TRAIN_MODE_JOINT_TRAIN): - # Output Shape: [batch * max_predictions_per_seq, - # loop_sent_number_per_doc]. - masked_lm_example_loss_doc_1 = tf.stack( - masked_lm_example_loss_doc_1, axis=1) - masked_lm_example_loss_doc_2 = tf.stack( - masked_lm_example_loss_doc_2, axis=1) - - # Output Shape: [batch, loop_sent_number_per_doc, max_predictions_per_seq]. 
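A quick shape check of the stacking step (again a self-contained sketch with toy sizes, not project code): stacking the per-sentence lists collected in the Python loop along axis 1 produces exactly the [batch, loop_sent_number_per_doc, ...] tensors the comments describe.

```python
import numpy as np

batch, hidden, loop_sent_number_per_doc = 2, 8, 3

# One list entry per loop iteration, shaped [batch, hidden] and [batch] respectively.
input_sent_reps_doc = [np.random.rand(batch, hidden) for _ in range(loop_sent_number_per_doc)]
input_mask_doc_level = [np.ones(batch, dtype=np.int32) for _ in range(loop_sent_number_per_doc)]

sent_reps = np.stack(input_sent_reps_doc, axis=1)   # [batch, loop_sent_number_per_doc, hidden]
doc_mask = np.stack(input_mask_doc_level, axis=1)   # [batch, loop_sent_number_per_doc]
print(sent_reps.shape, doc_mask.shape)              # (2, 3, 8) (2, 3)
```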
- masked_lm_weights_doc_1 = tf.stack(masked_lm_weights_doc_1, axis=1) - masked_lm_weights_doc_2 = tf.stack(masked_lm_weights_doc_2, axis=1) - else: - masked_lm_example_loss_doc_1 = tf.zeros([1]) - masked_lm_example_loss_doc_2 = tf.zeros([1]) - masked_lm_weights_doc_1 = tf.zeros([1]) - masked_lm_weights_doc_2 = tf.zeros([1]) - - return LearnSentRepsNormalLoop(input_sent_reps_doc_1_unmask, input_mask_doc_level_1_tensor, - input_sent_reps_doc_2_unmask, input_mask_doc_level_2_tensor, - masked_lm_loss_doc_1, masked_lm_loss_doc_2, - masked_lm_example_loss_doc_1, masked_lm_example_loss_doc_2, - masked_lm_weights_doc_1, masked_lm_weights_doc_2) diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/loss_fns.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/loss_fns.py deleted file mode 100644 index aca7b8eefb9ba4d7d96130d832d4c5e298002690..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/loss_fns.py +++ /dev/null @@ -1,98 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Loss functions used in dual encoder SMITH model.""" -from npu_bridge.npu_init import * -import tensorflow.compat.v1 as tf - - -def _pointwise_cosine(encodings_1, encodings_2): - """Pointwise version of cosine similarity function. - - Args: - encodings_1: A 2-D tensor of (left) encodings with shape [batch size, - encoding dim]. - encodings_2: A 2-D tensor of (right) encodings with shape [batch size, - encoding dim]. - - Returns: - A 1-D tensor of cosine similarities with shape [batch size]. - """ - similarities = tf.reduce_sum(tf.multiply(encodings_1, encodings_2), 1) - return similarities - - -def get_prediction_loss_cosine(input_tensor_1, - input_tensor_2, - labels, - similarity_score_amplifier=6.0, - neg_to_pos_example_ratio=1.0): - """Get prediction based on pointwise version of cosine similarity function. - - Compute the model predictions and losses based on cosine similarity functions. - This setting is useful for the binary classification task or regression task. - - Args: - input_tensor_1: The Tensor with shape [batch_size, embed_size] to denote the - left input text. - input_tensor_2: The Tensor with shape [batch_size, embed_size] to denote - the right input text. - labels: Float tensor with shape [batch_size]. 
The ground truth labels to - denote whether two documents are matched. - similarity_score_amplifier: The amplifier to increase the logits value, so - that sigmoid(logits) is closer to 0 or 1. The default value is 6.0. - neg_to_pos_example_ratio: The ratio to compensate when we have more negative - examples. - - Returns: - The loss, per example loss and similarities of two input texts. - """ - with tf.variable_scope("loss/text_pair_matching"): - logits = _pointwise_cosine(input_tensor_1, input_tensor_2) - - labels = tf.reshape(labels, [-1]) - labels = tf.cast(labels, tf.float32) - # To compensate when we have way more neg examples than pos examples and - # to compensate for larger masked lm loss. - # Note that we use weights_2 to make sure the weight for neg examples is - # 1, not 0. - weights_1 = tf.multiply(labels, neg_to_pos_example_ratio) - weights_2 = tf.add(tf.ones_like(labels), tf.negative(labels)) - # When neg_to_pos_example_ratio = 1.0, weights will be all ones. - weights = tf.add(weights_1, weights_2) - logits *= similarity_score_amplifier - per_example_loss = tf.losses.sigmoid_cross_entropy( - multi_class_labels=labels, - logits=logits, - weights=weights, - reduction=tf.losses.Reduction.NONE) - loss = tf.reduce_mean(per_example_loss) - return (loss, per_example_loss, logits) - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/loss_fns_test.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/loss_fns_test.py deleted file mode 100644 index 6faded34ac7570e0ba37a49bf2df4bea065a82af..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/loss_fns_test.py +++ /dev/null @@ -1,72 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
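The weighting trick in get_prediction_loss_cosine above is small enough to re-derive by hand. The NumPy sketch below (an independent illustration with invented inputs, not the repository's code) reproduces the cosine logits, the amplifier, and the per-example weights; with neg_to_pos_example_ratio = 1.0 the weights are all ones, as the in-code comment notes.

```python
import numpy as np

def prediction_loss_cosine_np(enc1, enc2, labels,
                              similarity_score_amplifier=6.0,
                              neg_to_pos_example_ratio=1.0):
    # Pointwise cosine: row-wise dot product of (assumed L2-normalized) encodings.
    logits = np.sum(enc1 * enc2, axis=1) * similarity_score_amplifier
    labels = labels.astype(np.float32)
    # weights = labels * ratio + (1 - labels): positives get `ratio`, negatives get 1.
    weights = labels * neg_to_pos_example_ratio + (1.0 - labels)
    # Numerically stable sigmoid cross-entropy, then the per-example weighting.
    per_example = np.maximum(logits, 0) - logits * labels + np.log1p(np.exp(-np.abs(logits)))
    per_example *= weights
    return per_example.mean(), per_example, logits

enc1 = np.array([[0.6, 0.8], [1.0, 0.0]])
enc2 = np.array([[0.6, 0.8], [0.0, 1.0]])
labels = np.array([1.0, 0.0])
print(prediction_loss_cosine_np(enc1, enc2, labels))
```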
-# ============================================================================== - -from npu_bridge.npu_init import * -import numpy as np -import tensorflow.compat.v1 as tf -from smith import loss_fns - - -class LossFnsTest(tf.test.TestCase): - - def test_get_prediction_loss_cosine(self): - input_tensor_1 = tf.constant( - [[0.5, 0.7, 0.8, 0.9, 0.1, 0.1], [0.1, 0.3, 0.3, 0.3, 0.1, 0.1]], - dtype=tf.float32) - input_tensor_2 = tf.constant( - [[0.1, 0.2, 0.2, 0.2, 0.2, 0.1], [0.1, 0.4, 0.4, 0.4, 0.1, 0.1]], - dtype=tf.float32) - labels = tf.constant([0, 1.0], dtype=tf.float32) - neg_to_pos_example_ratio = 1.0 - similarity_score_amplifier = 6.0 - loss, per_example_loss, similarities = \ - loss_fns.get_prediction_loss_cosine( - input_tensor_1=input_tensor_1, - input_tensor_2=input_tensor_2, - labels=labels, - similarity_score_amplifier=similarity_score_amplifier, - neg_to_pos_example_ratio=neg_to_pos_example_ratio) - with tf.Session(config=npu_config_proto()) as sess: - sess.run([tf.global_variables_initializer()]) - loss_numpy = sess.run(loss) - per_example_loss_numpy = sess.run(per_example_loss) - similarities_numpy = sess.run(similarities) - self.assertEqual(loss_numpy.shape, ()) - self.assertDTypeEqual(loss_numpy, np.float32) - - self.assertEqual(per_example_loss_numpy.shape, (2,)) - self.assertDTypeEqual(per_example_loss_numpy, np.float32) - - self.assertEqual(similarities_numpy.shape, (2,)) - self.assertDTypeEqual(similarities_numpy, np.float32) - -if __name__ == '__main__': - tf.test.main() - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/metric_fns.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/metric_fns.py deleted file mode 100644 index 06eab776c7bc725da9901c27c553a44bec60660c..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/metric_fns.py +++ /dev/null @@ -1,155 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -"""Metrics used in dual encoder SMITH model.""" -from npu_bridge.npu_init import * - -import tensorflow.compat.v1 as tf - - -def metric_fn_pretrain(masked_lm_example_loss_1, masked_lm_weights_1, - masked_sent_per_example_loss_1, masked_sent_weight_1, - masked_lm_example_loss_2, masked_lm_weights_2, - masked_sent_per_example_loss_2, masked_sent_weight_2, - predicted_class, labels, is_real_example): - """Computes the metrics of the model during pre-training. - - Note that the inputs of this metric_fn should be a list of tensors according - to the documentation of tpu_estimator - https://github.com/tensorflow/estimator/blob/master/tensorflow_estimator/python/estimator/tpu/tpu_estimator.py#L315 - - Args: - masked_lm_example_loss_1: float Tensor with shape [batch * - max_predictions_per_seq]. The per example loss for masked token LM - prediction from sequence 1. - masked_lm_weights_1: float Tensor with shape [batch, - max_predictions_per_seq]. The weights of masked tokens from sequence 1. - masked_sent_per_example_loss_1: float Tensor with shape [batch * - max_masked_sent_per_doc]. The per example los for masked sentence LM - prediction from sequence 1. - masked_sent_weight_1: float Tensor with shape [batch, - max_masked_sent_per_doc]. The weights of masked sentences from sequence 1. - masked_lm_example_loss_2: float Tensor with shape [batch * - max_predictions_per_seq]. The per example loss for masked token LM - prediction from sequence 2. - masked_lm_weights_2: float Tensor with shape [batch, - max_predictions_per_seq]. The weights of masked tokens from sequence 2. - masked_sent_per_example_loss_2: float Tensor with shape [batch * - max_masked_sent_per_doc]. The per example los for masked sentence LM - prediction from sequence 2. - masked_sent_weight_2: float Tensor with shape [batch, - max_masked_sent_per_doc]. The weights of masked sentences from sequence 2. - predicted_class: int Tensor with shape [batch]. The predicted class for each - example in the batch. - labels: float Tensor with shape [batch]. The ground truth label for each - example in the batch. - is_real_example: float Tensor with shape [batch]. The Tensor to indicate - whether an example is a real example or a padded fake example. It will be - used as the weights in the metrics computation. - - Returns: - The metrics dict to be used in the evaluation metrics. 
- """ - masked_lm_example_loss_1 = tf.reshape(masked_lm_example_loss_1, [-1]) - masked_lm_weights_1 = tf.reshape(masked_lm_weights_1, [-1]) - masked_lm_mean_loss_1 = tf.metrics.mean( - values=masked_lm_example_loss_1, weights=masked_lm_weights_1) - masked_lm_example_loss_2 = tf.reshape(masked_lm_example_loss_2, [-1]) - masked_lm_weights_2 = tf.reshape(masked_lm_weights_2, [-1]) - masked_lm_mean_loss_2 = tf.metrics.mean( - values=masked_lm_example_loss_2, weights=masked_lm_weights_2) - metrics_dict = { - "masked_lm_loss_1": masked_lm_mean_loss_1, - "masked_lm_loss_2": masked_lm_mean_loss_2, - } - labels = tf.reshape(labels, [-1]) - predicted_class = tf.reshape(predicted_class, [-1]) - accuracy = tf.metrics.accuracy( - labels=labels, predictions=predicted_class, weights=is_real_example) - metrics_dict["accuracy"] = accuracy - masked_sent_per_example_loss_1 = tf.reshape(masked_sent_per_example_loss_1, - [-1]) - masked_sent_weight_1 = tf.reshape(masked_sent_weight_1, [-1]) - masked_sent_lm_mean_loss_1 = tf.metrics.mean( - values=masked_sent_per_example_loss_1, weights=masked_sent_weight_1) - metrics_dict["masked_sent_lm_loss_1"] = masked_sent_lm_mean_loss_1 - masked_sent_per_example_loss_2 = tf.reshape(masked_sent_per_example_loss_2, - [-1]) - masked_sent_weight_2 = tf.reshape(masked_sent_weight_2, [-1]) - masked_sent_lm_mean_loss_2 = tf.metrics.mean( - values=masked_sent_per_example_loss_2, weights=masked_sent_weight_2) - metrics_dict["masked_sent_lm_loss_2"] = masked_sent_lm_mean_loss_2 - return metrics_dict - - -def metric_fn_finetune(predicted_class, labels, siamese_example_loss, - is_real_example): - """Computes the metrics of the model during fine-tuning. - - Note that the inputs of this metric_fn should be a list of tensors according - to the documentation of tpu_estimator - https://github.com/tensorflow/estimator/blob/master/tensorflow_estimator/python/estimator/tpu/tpu_estimator.py#L315 - - Args: - predicted_class: int Tensor with shape [batch]. The predicted class for each - example in the batch. - labels: float Tensor with shape [batch]. The ground truth label for each - example in the batch. - siamese_example_loss: float Tensor with shape [batch]. The per example text - pair matching loss. - is_real_example: float Tensor with shape [batch]. The Tensor to indicate - whether an example is a real example or a padded fake example. It will be - used as the weights in the metrics computation. - - Returns: - The metrics dict to be used in the evaluation metrics. 
- """ - tf.logging.info("***** predicted_class %s *****", str(predicted_class)) - tf.logging.info("***** labels %s *****", str(labels)) - tf.logging.info("***** siamese_example_loss %s *****", str(siamese_example_loss)) - tf.logging.info("***** is_real_example %s *****", str(is_real_example)) - - labels = tf.reshape(labels, [-1]) - siamese_loss = tf.metrics.mean( - values=siamese_example_loss, weights=is_real_example) - accuracy = tf.metrics.accuracy( - labels=labels, predictions=predicted_class, weights=is_real_example) - precision = tf.metrics.precision( - labels=labels, predictions=predicted_class, weights=is_real_example) - recall = tf.metrics.recall( - labels=labels, predictions=predicted_class, weights=is_real_example) - metrics_dict = { - "accuracy": accuracy, - "siamese_loss": siamese_loss, - "precision": precision, - "recall": recall - } - return metrics_dict - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/metric_fns_test.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/metric_fns_test.py deleted file mode 100644 index 7fe7e8fa3835dfd330f47a95459359e0f9c7ccab..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/metric_fns_test.py +++ /dev/null @@ -1,132 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from npu_bridge.npu_init import * -import numpy as np -import tensorflow.compat.v1 as tf -from smith import metric_fns - - -class MetricFnsTest(tf.test.TestCase): - - def test_metric_fn_pretrain(self): - masked_lm_example_loss_1 = tf.constant([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]) - masked_lm_weights_1 = tf.constant([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]) - masked_sent_per_example_loss_1 = tf.constant( - [[0.3, 0.3, 0.1, 0.2, 0.2, 0.1]]) - masked_sent_weight_1 = tf.constant([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]) - masked_lm_example_loss_2 = tf.constant([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]) - masked_lm_weights_2 = tf.constant([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]) - masked_sent_per_example_loss_2 = tf.constant( - [[0.3, 0.3, 0.1, 0.2, 0.2, 0.1]]) - masked_sent_weight_2 = tf.constant([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]) - labels = tf.constant([1, 0]) - predicted_class = tf.constant([0, 0]) - is_real_example = tf.constant([1.0, 1.0]) - metrics_dict = metric_fns.metric_fn_pretrain( - masked_lm_example_loss_1=masked_lm_example_loss_1, - masked_lm_weights_1=masked_lm_weights_1, - masked_sent_per_example_loss_1=masked_sent_per_example_loss_1, - masked_sent_weight_1=masked_sent_weight_1, - masked_lm_example_loss_2=masked_lm_example_loss_2, - masked_lm_weights_2=masked_lm_weights_2, - masked_sent_per_example_loss_2=masked_sent_per_example_loss_2, - masked_sent_weight_2=masked_sent_weight_2, - predicted_class=predicted_class, - labels=labels, - is_real_example=is_real_example) - init_g = tf.global_variables_initializer() - init_l = tf.local_variables_initializer() - with tf.Session(config=npu_config_proto()) as sess: - sess.run([init_g, init_l]) - # Runs update_op in metrics before checking the values. 
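The comment above points at an easy-to-miss property of tf.metrics: each call returns a (value, update_op) pair backed by local variables, so the update op has to run before the value is read. A minimal standalone reminder, using the same tensorflow.compat.v1 API as the deleted tests:

```python
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

values = tf.constant([0.1, 0.3])
mean_value, update_op = tf.metrics.mean(values)

with tf.Session() as sess:
    # tf.metrics state (running total and count) lives in *local* variables.
    sess.run(tf.local_variables_initializer())
    sess.run(update_op)            # accumulate first ...
    print(sess.run(mean_value))    # ... then read the running mean: 0.2
```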
- sess.run(metrics_dict) - metrics_dict_numpy = sess.run(metrics_dict) - self.assertEqual(metrics_dict_numpy["masked_lm_loss_1"][1].shape, ()) - self.assertAllClose(metrics_dict_numpy["masked_lm_loss_1"][1], 0.1) - self.assertDTypeEqual(metrics_dict_numpy["masked_lm_loss_1"][1], - np.float32) - - self.assertEqual(metrics_dict_numpy["masked_lm_loss_2"][1].shape, ()) - self.assertAllClose(metrics_dict_numpy["masked_lm_loss_2"][1], 0.1) - self.assertDTypeEqual(metrics_dict_numpy["masked_lm_loss_2"][1], - np.float32) - - self.assertEqual(metrics_dict_numpy["masked_sent_lm_loss_1"][1].shape, ()) - self.assertAllClose(metrics_dict_numpy["masked_sent_lm_loss_1"][1], 0.2) - self.assertDTypeEqual(metrics_dict_numpy["masked_sent_lm_loss_1"][1], - np.float32) - - self.assertEqual(metrics_dict_numpy["masked_sent_lm_loss_2"][1].shape, ()) - self.assertAllClose(metrics_dict_numpy["masked_sent_lm_loss_2"][1], 0.2) - self.assertDTypeEqual(metrics_dict_numpy["masked_sent_lm_loss_2"][1], - np.float32) - - self.assertEqual(metrics_dict_numpy["accuracy"][1].shape, ()) - self.assertAllClose(metrics_dict_numpy["accuracy"][1], 0.5) - self.assertDTypeEqual(metrics_dict_numpy["accuracy"][1], - np.float32) - - def test_metric_fn_finetune_binary_classification(self): - labels = tf.constant([1, 0, 1, 1]) - predicted_class = tf.constant([0, 0, 0, 1]) - siamese_example_loss = tf.constant([0.1, 0.2, 0.3, 0.4]) - is_real_example = tf.constant([1.0, 1.0, 1.0, 1.0]) - metrics_dict = metric_fns.metric_fn_finetune( - predicted_class=predicted_class, - labels=labels, - siamese_example_loss=siamese_example_loss, - is_real_example=is_real_example) - init_g = tf.global_variables_initializer() - init_l = tf.local_variables_initializer() - with tf.Session(config=npu_config_proto()) as sess: - sess.run([init_g, init_l]) - # Runs update_op in metrics before checking the values. - sess.run(metrics_dict) - metrics_dict_numpy = sess.run(metrics_dict) - self.assertEqual(metrics_dict_numpy["accuracy"][1].shape, ()) - self.assertAllClose(metrics_dict_numpy["accuracy"][1], 0.5) - self.assertDTypeEqual(metrics_dict_numpy["accuracy"][1], np.float32) - - self.assertEqual(metrics_dict_numpy["precision"][1].shape, ()) - self.assertAllClose(metrics_dict_numpy["precision"][1], 1) - self.assertDTypeEqual(metrics_dict_numpy["precision"][1], np.float32) - - self.assertEqual(metrics_dict_numpy["recall"][1].shape, ()) - self.assertAllClose(metrics_dict_numpy["recall"][1], 0.333333) - self.assertDTypeEqual(metrics_dict_numpy["recall"][1], np.float32) - - self.assertEqual(metrics_dict_numpy["siamese_loss"][1].shape, ()) - self.assertAllClose(metrics_dict_numpy["siamese_loss"][1], 0.25) - self.assertDTypeEqual(metrics_dict_numpy["siamese_loss"][1], np.float32) - -if __name__ == "__main__": - tf.test.main() - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling.py deleted file mode 100644 index 96b6fa204a67bcf7e949fd93339dffd79d842e92..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling.py +++ /dev/null @@ -1,547 +0,0 @@ -# coding=utf-8 -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Dual encoder SMITH models.""" -from npu_bridge.npu_init import * - -import tensorflow.compat.v1 as tf - -from smith import constants -from smith import layers -from smith import loss_fns -from smith import metric_fns -from smith import utils -from smith import modeling -from smith.bert import optimization - -import precision_tool.tf_config as npu_tf_config - -class BuildSmithDualEncoder(object): - - def __init__(self, masked_lm_loss_doc_1=None, masked_lm_loss_doc_2=None, - masked_lm_example_loss_doc_1=None, masked_lm_example_loss_doc_2=None, - masked_lm_weights_doc_1=None, masked_lm_weights_doc_2=None, - masked_sent_lm_loss_1=None, masked_sent_lm_loss_2=None, - masked_sent_per_example_loss_1=None, masked_sent_per_example_loss_2=None, - masked_sent_weight_1=None, masked_sent_weight_2=None, final_doc_rep_1=None, - final_doc_rep_2=None, input_sent_reps_doc_1_unmask=None, - input_sent_reps_doc_2_unmask=None, output_sent_reps_doc_1=None, - output_sent_reps_doc_2=None, siamese_loss=None, siamese_example_loss=None, - siamese_logits=None): - self.masked_lm_loss_doc_1 = masked_lm_loss_doc_1 - self.masked_lm_loss_doc_2 = masked_lm_loss_doc_2 - self.masked_lm_example_loss_doc_1 = masked_lm_example_loss_doc_1 - self.masked_lm_example_loss_doc_2 = masked_lm_example_loss_doc_2 - self.masked_lm_weights_doc_1 = masked_lm_weights_doc_1 - self.masked_lm_weights_doc_2 = masked_lm_weights_doc_2 - self.masked_sent_lm_loss_1 = masked_sent_lm_loss_1 - self.masked_sent_lm_loss_2 = masked_sent_lm_loss_2 - self.masked_sent_per_example_loss_1 = masked_sent_per_example_loss_1 - self.masked_sent_per_example_loss_2 = masked_sent_per_example_loss_2 - self.masked_sent_weight_1 = masked_sent_weight_1 - self.masked_sent_weight_2 = masked_sent_weight_2 - self.final_doc_rep_1 = final_doc_rep_1 - self.final_doc_rep_2 = final_doc_rep_2 - self.input_sent_reps_doc_1_unmask = input_sent_reps_doc_1_unmask - self.input_sent_reps_doc_2_unmask = input_sent_reps_doc_2_unmask - self.output_sent_reps_doc_1 = output_sent_reps_doc_1 - self.output_sent_reps_doc_2 = output_sent_reps_doc_2 - self.siamese_loss = siamese_loss - self.siamese_example_loss = siamese_example_loss - self.siamese_logits = siamese_logits - - def build_smith_dual_encoder(self, dual_encoder_config, - train_mode, - is_training, - input_ids_1, - input_mask_1, - masked_lm_positions_1, - masked_lm_ids_1, - masked_lm_weights_1, - input_ids_2, - input_mask_2, - masked_lm_positions_2, - 
masked_lm_ids_2, - masked_lm_weights_2, - use_one_hot_embeddings, - documents_match_labels, - debugging=False): - """Build the dual encoder SMITH model. - - Args: - dual_encoder_config: the configuration file for the dual encoder model. - train_mode: string. The train mode of the current. It can be finetune, - pretrain or joint_train. - is_training: bool. Whether it in training mode. - input_ids_1: int Tensor with shape [batch, max_seq_length]. The input ids of - input examples of text 1. - input_mask_1: int Tensor with shape [batch, max_seq_length]. The input masks - of input examples of text 1. - masked_lm_positions_1: int Tensor with shape [batch, - max_predictions_per_seq]. The input masked LM prediction positions of - input examples of text 1. This can be useful to compute the masked word - prediction LM loss. - masked_lm_ids_1: int Tensor with shape [batch, max_predictions_per_seq]. The - input masked LM prediction ids of input examples of text 1. It is the - ground truth in the masked word LM prediction task. This can be useful to - compute the masked word prediction LM loss. - masked_lm_weights_1: float Tensor with shape [batch, - max_predictions_per_seq]. The input masked LM prediction weights of input - examples of text 1. - input_ids_2: int Tensor with shape [batch, max_seq_length]. The input ids of - input examples of text 2. - input_mask_2: int Tensor with shape [batch, max_seq_length]. The input masks - of input examples of text 2. - masked_lm_positions_2: int Tensor with shape [batch, - max_predictions_per_seq]. The input masked LM prediction positions of - input examples of text 2. This can be useful to compute the masked word - prediction LM loss. - masked_lm_ids_2: int Tensor with shape [batch, max_predictions_per_seq]. The - input masked LM prediction ids of input examples of text 2. It is the - ground truth in the masked word LM prediction task. This can be useful to - compute the masked word prediction LM loss. - masked_lm_weights_2: float Tensor with shape [batch, - max_predictions_per_seq]. The input masked LM prediction weights of input - examples of text 2. - use_one_hot_embeddings: bool. Whether use one hot embeddings. - documents_match_labels: float Tensor with shape [batch]. The ground truth - labels for the input examples. - debugging: bool. Whether it is in the debugging mode. - - Returns: - The masked LM loss, per example LM loss, masked sentence LM loss, per - example masked sentence LM loss, sequence representations, text matching - loss, per example text matching loss, text matching logits, text matching - probabilities and text matching log probabilities. - - Raises: - ValueError: if the doc_rep_combine_mode in dual_encoder_config is invalid. 
- """ - - bert_config = modeling.BertConfig.from_json_file( - dual_encoder_config.encoder_config.bert_config_file) - doc_bert_config = modeling.BertConfig.from_json_file( - dual_encoder_config.encoder_config.doc_bert_config_file) - learn_sent_reps_normal_loop = layers.LearnSentRepsNormalLoop().learn_sent_reps_normal_loop( - dual_encoder_config, is_training, train_mode, input_ids_1, input_mask_1, - masked_lm_positions_1, masked_lm_ids_1, masked_lm_weights_1, input_ids_2, - input_mask_2, masked_lm_positions_2, masked_lm_ids_2, - masked_lm_weights_2, use_one_hot_embeddings) - input_sent_reps_doc_1_unmask, input_mask_doc_level_1_tensor, \ - input_sent_reps_doc_2_unmask, input_mask_doc_level_2_tensor, masked_lm_loss_doc_1, \ - masked_lm_loss_doc_2, masked_lm_example_loss_doc_1, masked_lm_example_loss_doc_2, \ - masked_lm_weights_doc_1, masked_lm_weights_doc_2 = \ - learn_sent_reps_normal_loop.input_sent_reps_doc_1_unmask, \ - learn_sent_reps_normal_loop.input_mask_doc_level_1_tensor, \ - learn_sent_reps_normal_loop.input_sent_reps_doc_2_unmask, \ - learn_sent_reps_normal_loop.input_mask_doc_level_2_tensor, \ - learn_sent_reps_normal_loop.masked_lm_loss_doc_1, learn_sent_reps_normal_loop.masked_lm_loss_doc_2, \ - learn_sent_reps_normal_loop.masked_lm_example_loss_doc_1, \ - learn_sent_reps_normal_loop.masked_lm_example_loss_doc_2, \ - learn_sent_reps_normal_loop.masked_lm_weights_doc_1, learn_sent_reps_normal_loop.masked_lm_weights_doc_2 - - if debugging: - input_mask_doc_level_1_tensor = tf.Print( - input_mask_doc_level_1_tensor, - [input_mask_doc_level_1_tensor, input_mask_doc_level_2_tensor], - message="input_mask_doc_level_1_tensor in build_smith_dual_encoder", - summarize=30) - - if dual_encoder_config.encoder_config.use_masked_sentence_lm_loss: - batch_size_static = ( - dual_encoder_config.train_eval_config.train_batch_size if is_training - else dual_encoder_config.train_eval_config.eval_batch_size) - # Generates the sentence masked document represenations. - with tf.variable_scope("mask_sent_in_doc", reuse=tf.AUTO_REUSE): - # Randomly initialize a masked sentence vector and reuse it. - # We also need to return the masked sentence position index to get the - # ground truth labels for the masked positions. The shape of - # sent_mask_embedding is [hidden]. - sent_mask_embedding = tf.get_variable( - name="sentence_mask_embedding", - shape=[bert_config.hidden_size], - initializer=tf.truncated_normal_initializer( - stddev=bert_config.initializer_range)) - # Output Shape: [batch, loop_sent_number_per_doc, hidden]. 
- (input_sent_reps_doc_1_masked, masked_sent_index_1, - masked_sent_weight_1) = layers.get_doc_rep_with_masked_sent( - input_sent_reps_doc=input_sent_reps_doc_1_unmask, - sent_mask_embedding=sent_mask_embedding, - input_mask_doc_level=input_mask_doc_level_1_tensor, - batch_size_static=batch_size_static, - max_masked_sent_per_doc=dual_encoder_config.encoder_config - .max_masked_sent_per_doc, - loop_sent_number_per_doc=dual_encoder_config.encoder_config - .loop_sent_number_per_doc) - (input_sent_reps_doc_2_masked, masked_sent_index_2, - masked_sent_weight_2) = layers.get_doc_rep_with_masked_sent( - input_sent_reps_doc=input_sent_reps_doc_2_unmask, - sent_mask_embedding=sent_mask_embedding, - input_mask_doc_level=input_mask_doc_level_2_tensor, - batch_size_static=batch_size_static, - max_masked_sent_per_doc=dual_encoder_config.encoder_config - .max_masked_sent_per_doc, - loop_sent_number_per_doc=dual_encoder_config.encoder_config - .loop_sent_number_per_doc) - # Learn the document representations based on masked sentence embeddings. - # Note that the variables in the DocBert model are not within the - # "mask_sent_in_doc" variable scope. - model_doc_1 = modeling.DocBertModel( - config=doc_bert_config, - is_training=is_training, - input_reps=input_sent_reps_doc_1_masked, - input_mask=input_mask_doc_level_1_tensor) - model_doc_2 = modeling.DocBertModel( - config=doc_bert_config, - is_training=is_training, - input_reps=input_sent_reps_doc_2_masked, - input_mask=input_mask_doc_level_2_tensor) - # Shape of masked_sent_lm_loss_1 [1]. - # Shape of masked_sent_lm_example_loss_1 is [batch * - # max_predictions_per_seq]. - (masked_sent_lm_loss_1, masked_sent_per_example_loss_1, - _) = layers.get_masked_sent_lm_output(doc_bert_config, - model_doc_1.get_sequence_output(), - input_sent_reps_doc_1_unmask, - masked_sent_index_1, - masked_sent_weight_1) - (masked_sent_lm_loss_2, masked_sent_per_example_loss_2, - _) = layers.get_masked_sent_lm_output(doc_bert_config, - model_doc_2.get_sequence_output(), - input_sent_reps_doc_2_unmask, - masked_sent_index_2, - masked_sent_weight_2) - else: - # Learn the document representations based on unmasked sentence embeddings. - model_doc_1 = modeling.DocBertModel( - config=doc_bert_config, - is_training=is_training, - input_reps=input_sent_reps_doc_1_unmask, - input_mask=input_mask_doc_level_1_tensor) - model_doc_2 = modeling.DocBertModel( - config=doc_bert_config, - is_training=is_training, - input_reps=input_sent_reps_doc_2_unmask, - input_mask=input_mask_doc_level_2_tensor) - masked_sent_lm_loss_1 = 0 - masked_sent_lm_loss_2 = 0 - masked_sent_per_example_loss_1 = tf.zeros(1) - masked_sent_per_example_loss_2 = tf.zeros(1) - masked_sent_weight_1 = tf.zeros(1) - masked_sent_weight_2 = tf.zeros(1) - - with tf.variable_scope("seq_rep_from_bert_doc_dense", reuse=tf.AUTO_REUSE): - normalized_doc_rep_1 = layers.get_seq_rep_from_bert(model_doc_1) - normalized_doc_rep_2 = layers.get_seq_rep_from_bert(model_doc_2) - - # We also dump the contextualized sentence embedding output by document - # level Transformer model. These representations maybe useful for sentence - # level tasks. - output_sent_reps_doc_1 = model_doc_1.get_sequence_output() - output_sent_reps_doc_2 = model_doc_2.get_sequence_output() - - # Here we support multiple modes to generate the final document - # representations based on the word/sentence/document level representations - # 1. normal: only use the document level representation as the final document - # representations. - # 2. 
sum_concat: firstly compute the sum of all sentence level repsentations. - # Then concatenate the sum vector with the document level representations. - # 3. mean_concat: firstly compute the mean of all sentence level - # repsentations. Then concatenate the mean vector with the document level - # representations. - # 4. attention: firstly compute the weighted sum of sentence level - # representations with attention mechanism, then concatenate the weighted sum - # vector with the document level representations. - # The document level mask is to indicate whether each sentence is - # a real sentence (1) or a paded sentence (0). The shape of - # input_mask_doc_level_1_tensor is [batch, max_doc_length_by_sentence]. The - # shape of input_sent_reps_doc_1_unmask is - # [batch, max_doc_length_by_sentence, hidden]. - final_doc_rep_combine_mode = dual_encoder_config.encoder_config.doc_rep_combine_mode - if final_doc_rep_combine_mode == constants.DOC_COMBINE_NORMAL: - final_doc_rep_1 = normalized_doc_rep_1 - final_doc_rep_2 = normalized_doc_rep_2 - elif final_doc_rep_combine_mode == constants.DOC_COMBINE_SUM_CONCAT: - # Output Shape: [batch, 2*hidden]. - final_doc_rep_1 = tf.concat( - [tf.reduce_sum(input_sent_reps_doc_1_unmask, 1), normalized_doc_rep_1], - axis=1) - final_doc_rep_2 = tf.concat( - [tf.reduce_sum(input_sent_reps_doc_2_unmask, 1), normalized_doc_rep_2], - axis=1) - elif final_doc_rep_combine_mode == constants.DOC_COMBINE_MEAN_CONCAT: - final_doc_rep_1 = tf.concat( - [tf.reduce_mean(input_sent_reps_doc_1_unmask, 1), normalized_doc_rep_1], - axis=1) - final_doc_rep_2 = tf.concat( - [tf.reduce_mean(input_sent_reps_doc_2_unmask, 1), normalized_doc_rep_2], - axis=1) - elif final_doc_rep_combine_mode == constants.DOC_COMBINE_ATTENTION: - final_doc_rep_1 = tf.concat([ - layers.get_attention_weighted_sum( - input_sent_reps_doc_1_unmask, bert_config, is_training, - dual_encoder_config.encoder_config.doc_rep_combine_attention_size), - normalized_doc_rep_1 - ], - axis=1) - final_doc_rep_2 = tf.concat([ - layers.get_attention_weighted_sum( - input_sent_reps_doc_2_unmask, bert_config, is_training, - dual_encoder_config.encoder_config.doc_rep_combine_attention_size), - normalized_doc_rep_2 - ], - axis=1) - else: - raise ValueError("Only normal, sum_concat, mean_concat and attention are" - " supported: %s" % final_doc_rep_combine_mode) - (siamese_loss, siamese_example_loss, - siamese_logits) = loss_fns.get_prediction_loss_cosine( - input_tensor_1=final_doc_rep_1, - input_tensor_2=final_doc_rep_2, - labels=documents_match_labels, - similarity_score_amplifier=dual_encoder_config.loss_config - .similarity_score_amplifier, - neg_to_pos_example_ratio=dual_encoder_config.train_eval_config - .neg_to_pos_example_ratio) - - # The shape of masked_lm_loss_doc is [1]. - # The shape of masked_lm_example_loss_doc is [batch * max_predictions_per_seq, - # max_doc_length_by_sentence]. 
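layers.get_attention_weighted_sum used by the attention combine mode is also defined outside this diff. One common way to realize that kind of attention pooling is a tanh-scored softmax over the sentence axis; the sketch below is an assumption about the idea, not the actual implementation (it also ignores the document-level mask for brevity).

```python
import numpy as np

def attention_weighted_sum(sent_reps, attention_size, rng=np.random.default_rng(0)):
    """Toy attention pooling over [batch, num_sent, hidden] sentence representations."""
    _, _, hidden = sent_reps.shape
    w = rng.normal(size=(hidden, attention_size))
    v = rng.normal(size=(attention_size,))
    scores = np.tanh(sent_reps @ w) @ v                               # [batch, num_sent]
    weights = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
    return (weights[..., None] * sent_reps).sum(axis=1)               # [batch, hidden]

reps = np.random.rand(2, 3, 16)
pooled = attention_weighted_sum(reps, attention_size=4)
print(pooled.shape)   # (2, 16); concatenated with the doc-level rep to give [batch, 2*hidden]
```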
- - return BuildSmithDualEncoder(masked_lm_loss_doc_1, masked_lm_loss_doc_2, - masked_lm_example_loss_doc_1, masked_lm_example_loss_doc_2, - masked_lm_weights_doc_1, masked_lm_weights_doc_2, - masked_sent_lm_loss_1, masked_sent_lm_loss_2, - masked_sent_per_example_loss_1, masked_sent_per_example_loss_2, - masked_sent_weight_1, masked_sent_weight_2, final_doc_rep_1, - final_doc_rep_2, input_sent_reps_doc_1_unmask, - input_sent_reps_doc_2_unmask, output_sent_reps_doc_1, - output_sent_reps_doc_2, siamese_loss, siamese_example_loss, - siamese_logits) - - -def model_fn_builder(dual_encoder_config, - train_mode, - learning_rate, - num_train_steps, - num_warmup_steps, - use_tpu, - use_one_hot_embeddings, - debugging=False): - """Returns `model_fn` closure for TPUEstimator.""" - - def model_fn(features, labels, mode, params): # pylint: disable=unused-argument - """The `model_fn` for TPUEstimator.""" - tf.logging.info("*** Current mode: %s ***" % mode) - tf.logging.info("*** Features ***") - for name in sorted(features.keys()): - tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) - - input_ids_1 = features["input_ids_1"] - input_mask_1 = features["input_mask_1"] - if train_mode == constants.TRAIN_MODE_FINETUNE: - masked_lm_positions_1 = tf.zeros([1]) - masked_lm_ids_1 = tf.zeros([1]) - masked_lm_weights_1 = tf.zeros([1]) - else: - masked_lm_positions_1 = features["masked_lm_positions_1"] - masked_lm_ids_1 = features["masked_lm_ids_1"] - masked_lm_weights_1 = features["masked_lm_weights_1"] - - input_ids_2 = features["input_ids_2"] - input_mask_2 = features["input_mask_2"] - if train_mode == constants.TRAIN_MODE_FINETUNE: - masked_lm_positions_2 = tf.zeros([1]) - masked_lm_ids_2 = tf.zeros([1]) - masked_lm_weights_2 = tf.zeros([1]) - else: - masked_lm_positions_2 = features["masked_lm_positions_2"] - masked_lm_ids_2 = features["masked_lm_ids_2"] - masked_lm_weights_2 = features["masked_lm_weights_2"] - documents_match_labels = features["documents_match_labels"] - # Since the document_match_labels might contain labels like 0/1/2, we need - # to transfer these labels to binary labels like 0/1. - documents_match_labels = tf.cast(documents_match_labels > 0, tf.float32) - is_real_example = None - if "is_real_example" in features: - is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) - else: - is_real_example = tf.ones( - tf.shape(documents_match_labels), dtype=tf.float32) - - is_training = (mode == tf.estimator.ModeKeys.TRAIN) - - if (dual_encoder_config.encoder_config.model_name == - constants.MODEL_NAME_SMITH_DUAL_ENCODER): - # For the smith model, since the actual looped number of sentences per - # document maybe smaller than max_doc_length_by_sentence, we need to - # overwrite the lm weights with the actual lm weights returned by the - # function. 
- smith_dual_encoder = BuildSmithDualEncoder().build_smith_dual_encoder( - dual_encoder_config, train_mode, is_training, input_ids_1, - input_mask_1, masked_lm_positions_1, masked_lm_ids_1, - masked_lm_weights_1, input_ids_2, input_mask_2, - masked_lm_positions_2, masked_lm_ids_2, masked_lm_weights_2, - use_one_hot_embeddings, documents_match_labels, debugging) - - masked_lm_loss_1, masked_lm_loss_2, masked_lm_example_loss_1, masked_lm_example_loss_2, \ - masked_lm_weights_1, masked_lm_weights_2, masked_sent_lm_loss_1, masked_sent_lm_loss_2, \ - masked_sent_per_example_loss_1, masked_sent_per_example_loss_2, masked_sent_weight_1, \ - masked_sent_weight_2, seq_embed_1, seq_embed_2, input_sent_embed_1, input_sent_embed_2, \ - output_sent_embed_1, output_sent_embed_2, siamese_loss, siamese_example_loss, siamese_logits = \ - smith_dual_encoder.masked_lm_loss_doc_1, smith_dual_encoder.masked_lm_loss_doc_2, \ - smith_dual_encoder.masked_lm_example_loss_doc_1, smith_dual_encoder.masked_lm_example_loss_doc_2, \ - smith_dual_encoder.masked_lm_weights_doc_1, smith_dual_encoder.masked_lm_weights_doc_2, \ - smith_dual_encoder.masked_sent_lm_loss_1, smith_dual_encoder.masked_sent_lm_loss_2, \ - smith_dual_encoder.masked_sent_per_example_loss_1, smith_dual_encoder.masked_sent_per_example_loss_2, \ - smith_dual_encoder.masked_sent_weight_1, smith_dual_encoder.masked_sent_weight_2, \ - smith_dual_encoder.final_doc_rep_1, smith_dual_encoder.final_doc_rep_2, \ - smith_dual_encoder.input_sent_reps_doc_1_unmask, \ - smith_dual_encoder.input_sent_reps_doc_2_unmask, \ - smith_dual_encoder.output_sent_reps_doc_1, smith_dual_encoder.output_sent_reps_doc_2, \ - smith_dual_encoder.siamese_loss, \ - smith_dual_encoder.siamese_example_loss, smith_dual_encoder.siamese_logits - - else: - raise ValueError( - "Only smith_dual_encoder is supported: %s" % - dual_encoder_config.encoder_config.model_name) - - # There are three different modes for training in the smith model. - # 1. joint_train: a multi-task learning setting which combines the masked - # word LM losses for doc1/doc2 and the siamese matching loss. If we add the - # masked sentence LM task, we also add the masked sentence LM losses for - # the two documents. - # 2. pretrain: only contains the masked word LM losses for doc1/doc2. We - # currently didn't include the NSP loss since NSP loss is not very useful - # according to the XLNet/ RoBERTa/ ALBERT paper. If we add the masked - # sentence LM task, we also add the masked sentence LM losses for the - # two documents. - # 3. finetune: fine tune the model with loaded pretrained checkpoint only - # with the siamese matching loss. If we add the masked sentence LM task, - # we also add the masked sentence LM losses for the two documents. - if train_mode == constants.TRAIN_MODE_JOINT_TRAIN: - total_loss = masked_lm_loss_1 + masked_lm_loss_2 + siamese_loss - elif train_mode == constants.TRAIN_MODE_PRETRAIN: - total_loss = masked_lm_loss_1 + masked_lm_loss_2 - elif train_mode == constants.TRAIN_MODE_FINETUNE: - total_loss = siamese_loss - else: - raise ValueError("Only joint_train, pretrain, finetune are supported.") - # If we add the masked sentence LM task, we also add the masked sentence - # LM losses for the two documents. 
- if dual_encoder_config.encoder_config.use_masked_sentence_lm_loss: - total_loss += (masked_sent_lm_loss_1 + masked_sent_lm_loss_2) - - total_loss = tf.identity(total_loss, name='total_loss') - - tvars = tf.trainable_variables() - initialized_variable_names = {} - scaffold_fn = None - init_checkpoint = dual_encoder_config.encoder_config.init_checkpoint - # Load pretrained BERT checkpoints if there is a specified path. - if init_checkpoint: - tf.logging.info("**** Passed pretrained BERT checkpoint = %s ****", - init_checkpoint) - (assignment_map, initialized_variable_names - ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) - if use_tpu: - - def tpu_scaffold(): - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - return tf.train.Scaffold() - - scaffold_fn = tpu_scaffold - else: - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - - tf.logging.info("**** Trainable Variables ****") - for var in tvars: - init_string = ", *INIT_RANDOMLY*" - if var.name in initialized_variable_names: - init_string = ", *INIT_FROM_CKPT*" - tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, - init_string) - output_spec = None - predicted_score = tf.sigmoid(siamese_logits) - predicted_class = tf.round(predicted_score) - - if dual_encoder_config.encoder_config.model_name == constants.MODEL_NAME_SMITH_DUAL_ENCODER: - _, prediction_dict = utils.get_export_outputs_prediction_dict_smith_de( - seq_embed_1, seq_embed_2, predicted_score, predicted_class, - documents_match_labels, input_sent_embed_1, input_sent_embed_2, - output_sent_embed_1, output_sent_embed_2) - else: - raise ValueError("Unsupported model: %s" % dual_encoder_config.encoder_config.model_name) - - if mode == tf.estimator.ModeKeys.TRAIN: - train_op = optimization.create_optimizer(total_loss, learning_rate, - num_train_steps, - num_warmup_steps, use_tpu) - # Add by:TC 20220705 - output_spec = tf.estimator.EstimatorSpec( - mode=mode, - loss=total_loss, - train_op=train_op, - training_hooks=[npu_tf_config.estimator_dump()]) - - elif mode == tf.estimator.ModeKeys.EVAL: - if (train_mode == constants.TRAIN_MODE_JOINT_TRAIN or - train_mode == constants.TRAIN_MODE_PRETRAIN): - eval_metrics = (metric_fns.metric_fn_pretrain, [ - masked_lm_example_loss_1, masked_lm_weights_1, - masked_sent_per_example_loss_1, masked_sent_weight_1, - masked_lm_example_loss_2, masked_lm_weights_2, - masked_sent_per_example_loss_2, masked_sent_weight_2, - predicted_class, documents_match_labels, is_real_example - ]) - elif train_mode == constants.TRAIN_MODE_FINETUNE: - eval_metrics = (metric_fns.metric_fn_finetune, [ - predicted_class, documents_match_labels, siamese_example_loss, - is_real_example - ]) - else: - raise ValueError("Only joint_train, pretrain, finetune are supported.") - output_spec = tf.estimator.tpu.TPUEstimatorSpec( - mode=mode, - loss=total_loss, - eval_metrics=eval_metrics, - scaffold_fn=scaffold_fn) - - elif mode == tf.estimator.ModeKeys.PREDICT: - output_spec = tf.estimator.tpu.TPUEstimatorSpec( - mode=mode, predictions=prediction_dict, scaffold_fn=scaffold_fn) - else: - raise ValueError("Only TRAIN, EVAL, PREDICT modes are supported: %s" % mode) - - return output_spec - - return model_fn - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling_test.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling_test.py deleted file mode 100644 index e1ed2fb2f44fcdb5f6a07f985b111bfaeb1af3d3..0000000000000000000000000000000000000000 --- 
a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/modeling_test.py +++ /dev/null @@ -1,199 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from npu_bridge.npu_init import * -import json -import tempfile -from absl import flags -import numpy as np -import tensorflow.compat.v1 as tf -from smith import constants -from smith import experiment_config_pb2 -from smith import modeling - -FLAGS = flags.FLAGS - - -class ModelingTest(tf.test.TestCase): - - def setUp(self): - super(ModelingTest, self).setUp() - bert_config = { - "attention_probs_dropout_prob": 0.1, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "hidden_size": 16, - "initializer_range": 0.02, - "intermediate_size": 32, - "max_position_embeddings": 16, - "num_attention_heads": 2, - "num_hidden_layers": 2, - "type_vocab_size": 2, - "vocab_size": 9 - } - with tempfile.NamedTemporaryFile(delete=False) as bert_config_writer: - bert_config_writer.write(json.dumps(bert_config).encode("utf-8")) - # Note that in practice the bert_config_file and doc_bert_config_file can - # be different. - bert_config_file = bert_config_writer.name - doc_bert_config_file = bert_config_writer.name - - # Construct a dual_encoder_config for testing purpose. - dual_encoder_config = experiment_config_pb2.DualEncoderConfig() - encoder_config = dual_encoder_config.encoder_config - encoder_config.model_name = constants.MODEL_NAME_SMITH_DUAL_ENCODER - encoder_config.max_seq_length = 6 - encoder_config.max_sent_length_by_word = 2 - encoder_config.max_doc_length_by_sentence = 3 - encoder_config.loop_sent_number_per_doc = 3 - encoder_config.max_predictions_per_seq = 1 - encoder_config.use_masked_sentence_lm_loss = True - encoder_config.max_masked_sent_per_doc = 2 - encoder_config.bert_config_file = bert_config_file - encoder_config.doc_bert_config_file = doc_bert_config_file - # Set train_batch_size and eval_batch_size for the batch_size_static used - # in the build_smith_ca function. 
- train_eval_config = dual_encoder_config.train_eval_config - train_eval_config.train_batch_size = 1 - train_eval_config.eval_batch_size = 1 - self.dual_encoder_config = dual_encoder_config - self.train_mode = constants.TRAIN_MODE_JOINT_TRAIN - - self.model_fn = modeling.model_fn_builder( - dual_encoder_config=dual_encoder_config, - train_mode=self.train_mode, - learning_rate=1e-5, - num_train_steps=100000, - num_warmup_steps=500, - use_tpu=False, - use_one_hot_embeddings=False, - debugging=True) - - self.features = { - "input_ids_1": tf.constant([[0, 5, 5, 7, 1, 1]], dtype=tf.int32), - "input_mask_1": tf.constant([[1, 1, 1, 1, 1, 1]], dtype=tf.int32), - "masked_lm_positions_1": tf.constant([[3]], dtype=tf.int32), - "masked_lm_ids_1": tf.constant([[5]], dtype=tf.int32), - "masked_lm_weights_1": tf.constant([[1.0]], dtype=tf.float32), - "input_ids_2": tf.constant([[0, 4, 4, 7, 1, 1]], dtype=tf.int32), - "input_mask_2": tf.constant([[1, 1, 1, 1, 1, 1]], dtype=tf.int32), - "masked_lm_positions_2": tf.constant([[3]], dtype=tf.int32), - "masked_lm_ids_2": tf.constant([[4]], dtype=tf.int32), - "masked_lm_weights_2": tf.constant([[1.0]], dtype=tf.float32), - "documents_match_labels": tf.constant([[1.0]], dtype=tf.float32) - } - - def test_build_smith_dual_encoder(self): - masked_lm_positions_1 = tf.constant([[0, 2, 5]], dtype=tf.int32) - masked_lm_ids_1 = tf.constant([[0, 5, 1]], dtype=tf.int32) - masked_lm_weights_1 = tf.constant([[1.0, 1.0, 1.0]], dtype=tf.float32) - masked_lm_positions_2 = tf.constant([[0, 2, 5]], dtype=tf.int32) - masked_lm_ids_2 = tf.constant([[0, 5, 1]], dtype=tf.int32) - masked_lm_weights_2 = tf.constant([[1.0, 1.0, 1.0]], dtype=tf.float32) - - (masked_lm_loss_1, _, - masked_lm_example_loss_1, _, - _, _, - masked_sent_lm_loss_1, _, - _, _, - _, _, sequence_encoding_1, - _, _, - _, _, - _, siamese_loss, siamese_example_loss, - siamese_logits) = \ - modeling.build_smith_dual_encoder( - dual_encoder_config=self.dual_encoder_config, - train_mode=self.train_mode, - is_training=True, - input_ids_1=self.features["input_ids_1"], - input_mask_1=self.features["input_mask_1"], - masked_lm_positions_1=masked_lm_positions_1, - masked_lm_ids_1=masked_lm_ids_1, - masked_lm_weights_1=masked_lm_weights_1, - input_ids_2=self.features["input_ids_2"], - input_mask_2=self.features["input_mask_2"], - masked_lm_positions_2=masked_lm_positions_2, - masked_lm_ids_2=masked_lm_ids_2, - masked_lm_weights_2=masked_lm_weights_2, - use_one_hot_embeddings=False, - documents_match_labels=self.features["documents_match_labels"]) - with tf.Session(config=npu_config_proto()) as sess: - sess.run([tf.global_variables_initializer()]) - result_numpy = sess.run([ - masked_lm_loss_1, masked_lm_example_loss_1, sequence_encoding_1, - siamese_loss, siamese_example_loss, siamese_logits, - masked_sent_lm_loss_1 - ]) - self.assertEqual(result_numpy[0].shape, ()) - self.assertDTypeEqual(result_numpy[0], np.float32) - - self.assertEqual(result_numpy[1].shape, (1, 3)) - self.assertDTypeEqual(result_numpy[1], np.float32) - - self.assertEqual(result_numpy[2].shape, (1, 16)) - self.assertDTypeEqual(result_numpy[2], np.float32) - - self.assertEqual(result_numpy[3].shape, ()) - self.assertDTypeEqual(result_numpy[3], np.float32) - - self.assertEqual(result_numpy[4].shape, (1,)) - self.assertDTypeEqual(result_numpy[4], np.float32) - - self.assertEqual(result_numpy[5].shape, (1,)) - self.assertDTypeEqual(result_numpy[5], np.float32) - - self.assertEqual(result_numpy[6].shape, ()) - self.assertDTypeEqual(result_numpy[6], 
np.float32) - - def test_model_fn_builder_train(self): - self.model_fn( - features=self.features, - labels=None, - mode=tf.estimator.ModeKeys.TRAIN, - params=None) - - def test_model_fn_builder_eval(self): - self.model_fn( - features=self.features, - labels=None, - mode=tf.estimator.ModeKeys.EVAL, - params=None) - - def test_model_fn_builder_predict(self): - self.model_fn( - features=self.features, - labels=None, - mode=tf.estimator.ModeKeys.PREDICT, - params=None) - - -if __name__ == "__main__": - tf.test.main() - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith.py deleted file mode 100644 index 898fcd27267e88142573c2f02ecb0553ea2642d3..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith.py +++ /dev/null @@ -1,589 +0,0 @@ -# coding=utf-8 -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Library to preprocess text data into SMITH dual encoder model inputs.""" -from npu_bridge.npu_init import * -import collections -import random -import nltk -import tensorflow.compat.v1 as tf -import tqdm -from smith import utils -from smith import wiki_doc_pair_pb2 -from smith.bert import tokenization - -flags = tf.flags - -FLAGS = flags.FLAGS - -flags.DEFINE_string("input_file", None, "Input data path.") - -flags.DEFINE_string( - "output_file", None, - "Output TF examples (or comma-separated list of files) in TFRecord " - "files.") - -flags.DEFINE_string("vocab_file", None, - "The vocabulary file that the SMITH model was trained on.") - -flags.DEFINE_bool( - "do_lower_case", True, - "Whether to lower case the input text. Should be True for uncased " - "models and False for cased models.") - -flags.DEFINE_bool("add_masks_lm", True, - "If true, add masks for word prediction LM pre-training.") - -flags.DEFINE_integer( - "max_sent_length_by_word", 32, "The maximum length of a sentence by tokens." - "A sentence will be cut off if longer than this length, and will be padded " - "if shorter than it. The sentence can also be a sentence block.") - -flags.DEFINE_integer( - "max_doc_length_by_sentence", 64, - "The maximum length of a document by sentences. 
A " - "document will be cut off if longer than this length, and" - "will be padded if shorter than it.") - -flags.DEFINE_bool( - "greedy_sentence_filling", True, - "If true, apply the greedy sentence filling trick to reduce the " - "number of padded tokens.") - -flags.DEFINE_integer("max_predictions_per_seq", 5, - "Maximum number of masked LM predictions per sequence.") - -flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") - -flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") - - -class TrainingInstance(object): - """A single training instance (sentence pair as dual encoder model inputs).""" - - def __init__(self, - tokens_1, - segment_ids_1, - masked_lm_positions_1, - masked_lm_labels_1, - input_mask_1, - masked_lm_weights_1, - tokens_2, - segment_ids_2, - masked_lm_positions_2, - masked_lm_labels_2, - input_mask_2, - masked_lm_weights_2, - instance_id, - documents_match_labels=-1.0): - self.tokens_1 = tokens_1 - self.segment_ids_1 = segment_ids_1 - self.masked_lm_positions_1 = masked_lm_positions_1 - self.masked_lm_labels_1 = masked_lm_labels_1 - self.input_mask_1 = input_mask_1 - self.masked_lm_weights_1 = masked_lm_weights_1 - self.tokens_2 = tokens_2 - self.segment_ids_2 = segment_ids_2 - self.masked_lm_positions_2 = masked_lm_positions_2 - self.masked_lm_labels_2 = masked_lm_labels_2 - self.input_mask_2 = input_mask_2 - self.masked_lm_weights_2 = masked_lm_weights_2 - self.instance_id = instance_id - self.documents_match_labels = documents_match_labels - - def __str__(self): - s = "" - s += "instance_id: %s\n" % self.instance_id - s += "documents_match_labels: %s\n" % (str(self.documents_match_labels)) - s += "tokens_1: %s\n" % (" ".join( - [tokenization.printable_text(x) for x in self.tokens_1])) - s += "segment_ids_1: %s\n" % (" ".join([str(x) for x in self.segment_ids_1 - ])) - s += "masked_lm_positions_1: %s\n" % (" ".join( - [str(x) for x in self.masked_lm_positions_1])) - s += "masked_lm_labels_1: %s\n" % (" ".join( - [tokenization.printable_text(x) for x in self.masked_lm_labels_1])) - s += "input_mask_1: %s\n" % (" ".join([str(x) for x in self.input_mask_1])) - s += "masked_lm_weights_1: %s\n" % (" ".join( - [str(x) for x in self.masked_lm_weights_1])) - s += "tokens_2: %s\n" % (" ".join( - [tokenization.printable_text(x) for x in self.tokens_2])) - s += "segment_ids_2: %s\n" % (" ".join([str(x) for x in self.segment_ids_2 - ])) - s += "masked_lm_positions_2: %s\n" % (" ".join( - [str(x) for x in self.masked_lm_positions_2])) - s += "masked_lm_labels_2: %s\n" % (" ".join( - [tokenization.printable_text(x) for x in self.masked_lm_labels_2])) - s += "input_mask_2: %s\n" % (" ".join([str(x) for x in self.input_mask_2])) - s += "masked_lm_weights_2: %s\n" % (" ".join( - [str(x) for x in self.masked_lm_weights_2])) - s += "\n" - return s - - def __repr__(self): - return self.__str__() - - -def add_features_for_one_doc(features, tokens, segment_ids, input_mask, - masked_lm_positions, masked_lm_labels, - masked_lm_weights, tokenizer, doc_index): - """Add features for one document in a WikiDocPair example.""" - input_ids = tokenizer.convert_tokens_to_ids(tokens) - features["input_ids_" + doc_index] = utils.create_int_feature(input_ids) - features["input_mask_" + doc_index] = utils.create_int_feature(input_mask) - features["segment_ids_" + doc_index] = utils.create_int_feature(segment_ids) - - if masked_lm_labels: - masked_lm_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels) - features["masked_lm_positions_" + - doc_index] = 
utils.create_int_feature(masked_lm_positions) - features["masked_lm_ids_" + - doc_index] = utils.create_int_feature(masked_lm_ids) - features["masked_lm_weights_" + - doc_index] = utils.create_float_feature(masked_lm_weights) - - -def write_instance_to_example_files(instances, tokenizer, output_files): - """Create TF example files from `TrainingInstance`s.""" - writers = [] - for output_file in output_files: - writers.append(tf.python_io.TFRecordWriter(output_file)) - writer_index = 0 - total_written = 0 - for (inst_index, instance) in enumerate(instances): - features = collections.OrderedDict() - add_features_for_one_doc( - features=features, - tokens=instance.tokens_1, - segment_ids=instance.segment_ids_1, - input_mask=instance.input_mask_1, - masked_lm_positions=instance.masked_lm_positions_1, - masked_lm_labels=instance.masked_lm_labels_1, - masked_lm_weights=instance.masked_lm_weights_1, - tokenizer=tokenizer, - doc_index="1") - add_features_for_one_doc( - features=features, - tokens=instance.tokens_2, - segment_ids=instance.segment_ids_2, - input_mask=instance.input_mask_2, - masked_lm_positions=instance.masked_lm_positions_2, - masked_lm_labels=instance.masked_lm_labels_2, - masked_lm_weights=instance.masked_lm_weights_2, - tokenizer=tokenizer, - doc_index="2") - # Adds fields on more content/id information of the current example. - features["instance_id"] = utils.create_bytes_feature( - [bytes(instance.instance_id, "utf-8")]) - features["tokens_1"] = utils.create_bytes_feature( - [bytes(t, "utf-8") for t in instance.tokens_1]) - features["tokens_2"] = utils.create_bytes_feature( - [bytes(t, "utf-8") for t in instance.tokens_2]) - # Adds the documents matching labels. - features["documents_match_labels"] = utils.create_float_feature( - [float(instance.documents_match_labels)]) - tf_example = tf.train.Example(features=tf.train.Features(feature=features)) - - writers[writer_index].write(tf_example.SerializeToString()) - writer_index = (writer_index + 1) % len(writers) - - total_written += 1 - - if inst_index < 5: - tf.logging.info("*** Example ***") - tf.logging.info( - "tokens_1: %s" % - " ".join([tokenization.printable_text(x) for x in instance.tokens_1])) - tf.logging.info( - "tokens_2: %s" % - " ".join([tokenization.printable_text(x) for x in instance.tokens_2])) - - for feature_name in features.keys(): - feature = features[feature_name] - values = [] - if feature.int64_list.value: - values = feature.int64_list.value - elif feature.float_list.value: - values = feature.float_list.value - elif feature.bytes_list.value: - values = feature.bytes_list.value - tf.logging.info("%s: %s" % - (feature_name, " ".join([str(x) for x in values]))) - - for writer in writers: - writer.close() - - tf.logging.info("Wrote %d total instances", total_written) - - -def get_smith_model_tokens(input_text, tokenizer, sent_token_counter): - """Generate tokens given an input text for the SMITH model.""" - res_tokens = [] - for sent in nltk.tokenize.sent_tokenize(input_text): - # The returned res_tokens is a 2D list to maintain the sentence boundary - # information. We removed all the empty tokens in this step. - if not sent: - continue - tokens = [w for w in tokenizer.tokenize(sent) if w] - sent_token_counter[0] += 1 # Track number of sentences. - sent_token_counter[1] += len(tokens) # Track number of tokens. 
- res_tokens.append(tokens) - return (res_tokens, sent_token_counter) - - -def create_training_instances_wiki_doc_pair( - input_file, tokenizer, max_sent_length_by_word, max_doc_length_by_sentence, - masked_lm_prob, max_predictions_per_seq, rng): - """Create `TrainingInstance`s from WikiDocPair proto data.""" - # The input data is in the WikiDocPair proto format in tfrecord. - # Add by:TC - wiki_doc_pair = wiki_doc_pair_pb2.WikiDocPair() - instances = [] - # Add some counters to track some data statistics. - sent_token_counter = [0, 0] - for example in tqdm.tqdm(tf.python_io.tf_record_iterator(input_file)): - doc_pair = wiki_doc_pair.FromString(example) - # If model_name = smith_dual_encoder, we firstly use a sentence tokenizer - # to split doc_one/doc_two texts into different sentences and use [SEN] to - # label the sentence boundary information. So in the masking and padding - # step, we know the boundary between different sentences and we can do the - # masking and padding according to the actual length of each sentence. - doc_one_text = " \n\n\n\n\n\n ".join( - [a.text for a in doc_pair.doc_one.section_contents]) - doc_two_text = " \n\n\n\n\n\n ".join( - [a.text for a in doc_pair.doc_two.section_contents]) - doc_one_text = tokenization.convert_to_unicode(doc_one_text).strip() - doc_two_text = tokenization.convert_to_unicode(doc_two_text).strip() - doc_one_tokens, sent_token_counter = get_smith_model_tokens( - doc_one_text, tokenizer, sent_token_counter) - doc_two_tokens, sent_token_counter = get_smith_model_tokens( - doc_two_text, tokenizer, sent_token_counter) - # Skip the document pairs if any document is empty. - if not doc_one_tokens or not doc_two_tokens: - continue - vocab_words = list(tokenizer.vocab.keys()) - instance_id = doc_pair.id - if doc_pair.human_label_for_classification: - doc_match_label = doc_pair.human_label_for_classification - else: - # Set the label as 0.0 if there are no available labels. 
- doc_match_label = 0.0 - instances.append( - create_instance_from_wiki_doc_pair( - instance_id, doc_match_label, doc_one_tokens, doc_two_tokens, - max_sent_length_by_word, max_doc_length_by_sentence, masked_lm_prob, - max_predictions_per_seq, vocab_words, rng)) - rng.shuffle(instances) - return (instances, sent_token_counter) - - -def create_instance_from_wiki_doc_pair(instance_id, doc_match_label, - doc_one_tokens, doc_two_tokens, - max_sent_length_by_word, - max_doc_length_by_sentence, - masked_lm_prob, max_predictions_per_seq, - vocab_words, rng): - """Creates `TrainingInstance`s for a WikiDocPair input data.""" - tokens_segment_ids_masks = GetTokensSegmentIdsMasks().get_tokens_segment_ids_masks - - tokens_segment_ids_masks_res_1 = \ - tokens_segment_ids_masks(max_sent_length_by_word, max_doc_length_by_sentence, doc_one_tokens, masked_lm_prob, - max_predictions_per_seq, vocab_words, rng) - - tokens_1, segment_ids_1, masked_lm_positions_1, masked_lm_labels_1, input_mask_1, masked_lm_weights_1 = \ - tokens_segment_ids_masks_res_1.tokens_doc, tokens_segment_ids_masks_res_1.segment_ids_doc, \ - tokens_segment_ids_masks_res_1.masked_lm_positions_doc, tokens_segment_ids_masks_res_1.masked_lm_labels_doc, \ - tokens_segment_ids_masks_res_1.input_mask_doc, tokens_segment_ids_masks_res_1.masked_lm_weights_doc - - tokens_segment_ids_masks_res_2= \ - tokens_segment_ids_masks(max_sent_length_by_word, max_doc_length_by_sentence, doc_two_tokens, masked_lm_prob, - max_predictions_per_seq, vocab_words, rng) - - tokens_2, segment_ids_2, masked_lm_positions_2, masked_lm_labels_2, input_mask_2, masked_lm_weights_2 = \ - tokens_segment_ids_masks_res_2.tokens_doc, tokens_segment_ids_masks_res_2.segment_ids_doc, \ - tokens_segment_ids_masks_res_2.masked_lm_positions_doc, tokens_segment_ids_masks_res_2.masked_lm_labels_doc, \ - tokens_segment_ids_masks_res_2.input_mask_doc, tokens_segment_ids_masks_res_2.masked_lm_weights_doc - - instance = TrainingInstance( - tokens_1=tokens_1, - segment_ids_1=segment_ids_1, - masked_lm_positions_1=masked_lm_positions_1, - masked_lm_labels_1=masked_lm_labels_1, - input_mask_1=input_mask_1, - masked_lm_weights_1=masked_lm_weights_1, - tokens_2=tokens_2, - segment_ids_2=segment_ids_2, - masked_lm_positions_2=masked_lm_positions_2, - masked_lm_labels_2=masked_lm_labels_2, - input_mask_2=input_mask_2, - masked_lm_weights_2=masked_lm_weights_2, - instance_id=instance_id, - documents_match_labels=doc_match_label) - return instance - - -class GetTokensSegmentIdsMasks(object): - - def __init__(self, tokens_doc=None, segment_ids_doc=None, masked_lm_positions_doc=None, - masked_lm_labels_doc=None, input_mask_doc=None, masked_lm_weights_doc=None): - self.tokens_doc = tokens_doc - self.segment_ids_doc = segment_ids_doc - self.masked_lm_positions_doc = masked_lm_positions_doc - self.masked_lm_labels_doc = masked_lm_labels_doc - self.input_mask_doc = input_mask_doc - self.masked_lm_weights_doc = masked_lm_weights_doc - - def get_tokens_segment_ids_masks(self, max_sent_length_by_word, - max_doc_length_by_sentence, doc_one_tokens, - masked_lm_prob, max_predictions_per_seq, - vocab_words, rng): - """Get the tokens, segment ids and masks of an input sequence.""" - # The format of tokens for SMITH dual encoder models is like: - # [CLS] block1_token1 block1_token2 block1_token3 ... [SEP] [SEP] [PAD] ... - # [CLS] block2_token1 block2_token2 block2_token3 ... [SEP] [SEP] [PAD] ... - # [CLS] block3_token1 block3_token2 block3_token3 ... [SEP] [SEP] [PAD] ... 
-    # If max_sent_length_by_word is large, then there will be many padded
-    # words in the sentence. Here we added an optional "greedy sentence
-    # filling" trick in order to reduce the number of padded words and
-    # maintain all content in the document. We allow a "sentence" block to
-    # contain more than one natural sentence and try to fill as many
-    # sentences as possible into the "sentence" block. If a sentence would be
-    # cut off and the current sentence block is not empty, we put the
-    # sentence into the next "sentence" block.
-    # According to the ALBERT paper and the RoBERTa paper, a segment is
-    # usually comprised of more than one natural sentence, which has been
-    # shown to benefit performance. doc_one_tokens is a 2D list which
-    # contains the sentence boundary information.
-    sentence_num = len(doc_one_tokens)
-    # sent_block_token_list is a 2D list to maintain sentence block tokens.
-    sent_block_token_list = []
-    natural_sentence_index = -1
-    while natural_sentence_index + 1 < sentence_num:
-      natural_sentence_index += 1
-      sent_tokens = doc_one_tokens[natural_sentence_index]
-      if not sent_tokens:
-        continue
-      if FLAGS.greedy_sentence_filling:
-        cur_sent_block_length = 0
-        cur_sent_block = []
-        # Fill as many sentences as possible in the current sentence block in
-        # a greedy way.
-        while natural_sentence_index < sentence_num:
-          cur_natural_sent_tokens = doc_one_tokens[natural_sentence_index]
-          if not cur_natural_sent_tokens:
-            natural_sentence_index += 1
-            continue
-          cur_sent_len = len(cur_natural_sent_tokens)
-          if ((cur_sent_block_length + cur_sent_len) <=
-              (max_sent_length_by_word - 3)) or cur_sent_block_length == 0:
-            # One exceptional case here is that if the 1st sentence of a
-            # sentence block is already going across the boundary, then the
-            # current sentence block will be empty. So when
-            # cur_sent_block_length is 0 and we meet a natural sentence with
-            # length longer than (max_sent_length_by_word - 3), we still put
-            # this natural sentence in the current sentence block. In this
-            # case, this long natural sentence will be cut off with the final
-            # length up to (max_sent_length_by_word - 3).
-            cur_sent_block.extend(cur_natural_sent_tokens)
-            cur_sent_block_length += cur_sent_len
-            natural_sentence_index += 1
-          else:
-            # If cur_sent_block_length + cur_sent_len > max_sent_length_by_word-3
-            # and the current sentence block is not empty, the sentence which
-            # goes across the boundary will be put into the next sentence
-            # block.
-            natural_sentence_index -= 1
-            break
-        sent_tokens = cur_sent_block
-      sent_block_token_list.append(sent_tokens)
-      if len(sent_block_token_list) >= max_doc_length_by_sentence:
-        break  # Skip more sentence blocks if the document is too long.
-    # For each sentence block, generate the token sequences, masks and
-    # paddings.
- tokens_doc = [] - segment_ids_doc = [] - masked_lm_positions_doc = [] - masked_lm_labels_doc = [] - input_mask_doc = [] - masked_lm_weights_doc = [] - for block_index in range(len(sent_block_token_list)): - token_masks_paddings = \ - GetTokenMasksPaddings().get_token_masks_paddings( - sent_block_token_list[block_index], - max_sent_length_by_word, - masked_lm_prob, - max_predictions_per_seq, - vocab_words, - rng, - block_index) - - tokens_block, segment_ids_block, masked_lm_positions_block, masked_lm_labels_block, \ - input_mask_block, masked_lm_weights_block = \ - token_masks_paddings.tokens, token_masks_paddings.segment_ids, token_masks_paddings.masked_lm_positions, \ - token_masks_paddings.masked_lm_labels, token_masks_paddings.input_mask, \ - token_masks_paddings.masked_lm_weights - - tokens_doc.extend(tokens_block) - segment_ids_doc.extend(segment_ids_block) - masked_lm_positions_doc.extend(masked_lm_positions_block) - masked_lm_labels_doc.extend(masked_lm_labels_block) - input_mask_doc.extend(input_mask_block) - masked_lm_weights_doc.extend(masked_lm_weights_block) - - # Pad sentence blocks if the actual number of sentence blocks is less than - # max_doc_length_by_sentence. - sentence_block_index = len(sent_block_token_list) - while sentence_block_index < max_doc_length_by_sentence: - for _ in range(max_sent_length_by_word): - tokens_doc.append("[PAD]") - segment_ids_doc.append(0) - input_mask_doc.append(0) - for _ in range(max_predictions_per_seq): - masked_lm_positions_doc.append(0) - masked_lm_labels_doc.append("[PAD]") - masked_lm_weights_doc.append(0.0) - sentence_block_index += 1 - assert len(tokens_doc) == max_sent_length_by_word * max_doc_length_by_sentence - assert len(masked_lm_labels_doc - ) == max_predictions_per_seq * max_doc_length_by_sentence - - return GetTokensSegmentIdsMasks(tokens_doc, segment_ids_doc, masked_lm_positions_doc, - masked_lm_labels_doc, input_mask_doc, masked_lm_weights_doc) - - -class GetTokenMasksPaddings(object): - - def __init__(self, tokens=None, segment_ids=None, masked_lm_positions=None, masked_lm_labels=None, - input_mask=None, masked_lm_weights=None): - self.tokens = tokens - self.segment_ids = segment_ids - self.masked_lm_positions = masked_lm_positions - self.masked_lm_labels = masked_lm_labels - self.input_mask = input_mask - self.masked_lm_weights = masked_lm_weights - - def get_token_masks_paddings(self, block_tokens, max_sent_length_by_word, - masked_lm_prob, max_predictions_per_seq, - vocab_words, rng, block_index): - """Generates tokens, masks and paddings for the input block tokens.""" - # Account for [CLS], [SEP], [SEP] - max_num_tokens = max_sent_length_by_word - 3 - # Truncates the sequence if sequence length is longer than max_num_tokens. - tokens = [] - segment_ids = [] - if len(block_tokens) > max_num_tokens: - block_tokens = block_tokens[0:max_num_tokens] - tokens_a = block_tokens - tokens.append("[CLS]") - segment_ids.append(0) - for token in tokens_a: - tokens.append(token) - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) - masked_lm_positions = [] - masked_lm_labels = [] - masked_lm_weights = [] - if max_predictions_per_seq > 0: - (tokens, masked_lm_positions, - masked_lm_labels) = utils.create_masked_lm_predictions( - tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) - # Add [PAD] to tokens and masked LM related lists. 
- input_mask = [1] * len(tokens) - while len(tokens) < max_sent_length_by_word: - tokens.append("[PAD]") - input_mask.append(0) - segment_ids.append(0) - - assert len(tokens) == max_sent_length_by_word - assert len(input_mask) == max_sent_length_by_word - assert len(segment_ids) == max_sent_length_by_word - - if max_predictions_per_seq > 0: - # Transfer local positions in masked_lm_positions to global positions in the - # whole document to be consistent with the model training pipeline. - masked_lm_positions = [ - (i + max_sent_length_by_word * block_index) for i in masked_lm_positions - ] - masked_lm_weights = [1.0] * len(masked_lm_labels) - - while len(masked_lm_positions) < max_predictions_per_seq: - masked_lm_positions.append(0) - masked_lm_labels.append("[PAD]") - masked_lm_weights.append(0.0) - - return GetTokenMasksPaddings(tokens, segment_ids, masked_lm_positions, masked_lm_labels, - input_mask, masked_lm_weights) - - -def main(_): - tf.logging.set_verbosity(tf.logging.INFO) - - tokenizer = tokenization.FullTokenizer( - vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) - - input_files = [] - for input_pattern in FLAGS.input_file.split(","): - input_files.extend(tf.gfile.Glob(input_pattern)) - - tf.logging.info("*** Reading from input files ***") - for input_file in input_files: - tf.logging.info(" %s", input_file) - rng = random.Random(FLAGS.random_seed) - # Creates training instances. - max_predictions_per_seq = FLAGS.max_predictions_per_seq if FLAGS.add_masks_lm else 0 - masked_lm_prob = FLAGS.masked_lm_prob if FLAGS.add_masks_lm else 0 - instances, sent_token_counter = create_training_instances_wiki_doc_pair( - input_file=FLAGS.input_file, - tokenizer=tokenizer, - max_sent_length_by_word=FLAGS.max_sent_length_by_word, - max_doc_length_by_sentence=FLAGS.max_doc_length_by_sentence, - masked_lm_prob=masked_lm_prob, - max_predictions_per_seq=max_predictions_per_seq, - rng=rng) - - output_files = FLAGS.output_file.split(",") - tf.logging.info("*** Writing to output files ***") - for output_file in output_files: - tf.logging.info(" %s", output_file) - - # Transfers training instances into tensorflow examples and write the results. - write_instance_to_example_files(instances, tokenizer, output_files) - - # Finally outputs some data statistics. - tf.logging.info("sent_count, token_count, doc_pair_count: %d %d %d", - sent_token_counter[0], sent_token_counter[1], len(instances)) - - -if __name__ == "__main__": - flags.mark_flag_as_required("input_file") - flags.mark_flag_as_required("output_file") - flags.mark_flag_as_required("vocab_file") - tf.app.run() - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test.py deleted file mode 100644 index f4b828c75d5077d9bb36731beca7160ceda65e7e..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/preprocessing_smith_test.py +++ /dev/null @@ -1,108 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from npu_bridge.npu_init import * -import random -import tempfile - -from absl import flags -import tensorflow.compat.v1 as tf - -from smith import preprocessing_smith -from smith.bert import tokenization - -FLAGS = flags.FLAGS - - -class PreprocessingSmithTest(tf.test.TestCase): - - def setUp(self): - super(PreprocessingSmithTest, self).setUp() - doc_one_text = ( - "I am in Dominick's for my dinner. OK, no problem. I am " - "in Dominick's for my dinner which is the best dinner I have " - "in my whole life.") - doc_one_text = tokenization.convert_to_unicode(doc_one_text).strip() - vocab_tokens = [ - "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "i", "am", "in", "for", - "my", "dinner", "ok", "no", "problem", "which", "is", "the", "be", - "##s", "##t", "," - ] - with tempfile.NamedTemporaryFile(delete=False) as vocab_writer: - vocab_writer.write("".join([x + "\n" for x in vocab_tokens - ]).encode("utf-8")) - self.vocab_file = vocab_writer.name - self.tokenizer = tokenization.FullTokenizer( - vocab_file=self.vocab_file, do_lower_case=True) - self.vocab_words = list(self.tokenizer.vocab.keys()) - self.rng = random.Random(12345) - self.doc_one_tokens, _ = preprocessing_smith.get_smith_model_tokens( - doc_one_text, self.tokenizer, [0, 0]) - self.max_sent_length_by_word = 20 - self.max_doc_length_by_sentence = 3 - self.greedy_sentence_filling = True - self.max_predictions_per_seq = 0 - self.masked_lm_prob = 0 - - def test_get_tokens_segment_ids_masks(self): - (tokens_1, segment_ids_1, _, _, input_mask_1, _) = \ - preprocessing_smith.get_tokens_segment_ids_masks( - max_sent_length_by_word=self.max_sent_length_by_word, - max_doc_length_by_sentence=self.max_doc_length_by_sentence, - doc_one_tokens=self.doc_one_tokens, - masked_lm_prob=self.masked_lm_prob, - max_predictions_per_seq=self.max_predictions_per_seq, - vocab_words=self.vocab_words, - rng=self.rng) - self.assertEqual(tokens_1, [ - "[CLS]", "i", "am", "in", "[UNK]", "[UNK]", "[UNK]", "for", "my", - "dinner", "[UNK]", "ok", ",", "no", "problem", "[UNK]", "[SEP]", - "[SEP]", "[PAD]", "[PAD]", "[CLS]", "i", "am", "in", "[UNK]", "[UNK]", - "[UNK]", "for", "my", "dinner", "which", "is", "the", "be", "##s", - "##t", "dinner", "i", "[SEP]", "[SEP]", "[PAD]", "[PAD]", "[PAD]", - "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", - "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", "[PAD]", - "[PAD]" - ]) - self.assertEqual(segment_ids_1, [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - ]) - self.assertEqual(input_mask_1, [ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, - 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - ]) - - -if __name__ == "__main__": - tf.test.main() - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/requirements.txt b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/requirements.txt deleted file mode 100644 index 92734efa4f39430f33a828cd8f52ea51796c2db2..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -# Install protoc to use protobuf refer to the README file -# Run pip install --upgrade pip if tensorflow 1.15 cannot be found -# tensorflow==1.15.2 # CPU Version of TensorFlow; The Python version should be <=3.7 -tensorflow-gpu==1.15 # GPU version of TensorFlow; The Python version should be <=3.7 -tf_slim==1.1.0 -nltk>=3.5 -tqdm>=4.50.1 -numpy>=1.13.3 diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/run_smith.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/run_smith.py deleted file mode 100644 index 280dc81035fb9419f7ca2090bdf8f420501e52cd..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/run_smith.py +++ /dev/null @@ -1,378 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Dual encoder SMITH models.""" -from npu_bridge.npu_init import * -import json -import os -from absl import app -from absl import flags -import tensorflow.compat.v1 as tf -from smith import constants -from smith import experiment_config_pb2 -from smith import input_fns -from smith import modeling as smith_modeling -from smith import utils - -flags.DEFINE_string("dual_encoder_config_file", None, - "The proto config file for dual encoder SMITH models.") - -flags.DEFINE_string( - "output_dir", None, - "The output directory where the model checkpoints will be written.") - -flags.DEFINE_enum( - "train_mode", None, ["finetune", "pretrain", "joint_train"], - "Whether it is joint_train, pretrain or finetune. 
The difference is " - "about total_loss calculation and input files for eval and training.") - -flags.DEFINE_enum( - "schedule", None, ["train", "continuous_eval", "predict", "export"], - "The run schedule which can be any one of train, continuous_eval, " - "predict or export.") - -flags.DEFINE_bool("debugging", False, - "Print out some information for debugging.") - -flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") - -flags.DEFINE_integer("num_train_steps", None, "Number of training steps.") - -flags.DEFINE_integer("num_warmup_steps", None, "Number of warmup steps.") - -flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") - -flags.DEFINE_string( - "tpu_name", None, - "The Cloud TPU to use for training. This should be either the name " - "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " - "url.") - -flags.DEFINE_string( - "tpu_zone", None, - "[Optional] GCE zone where the Cloud TPU is located in. If not " - "specified, we will attempt to automatically detect the GCE project from " - "metadata.") - -flags.DEFINE_string( - "gcp_project", None, - "[Optional] Project name for the Cloud TPU-enabled project. If not " - "specified, we will attempt to automatically detect the GCE project from " - "metadata.") - -flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") - -flags.DEFINE_integer( - "num_tpu_cores", 8, - "Only used if `use_tpu` is True. Total number of TPU cores to use.") - -FLAGS = flags.FLAGS - - -class _LogSessionRunHook(tf.train.SessionRunHook): - def __init__(self, display_every=1): - self.display_every = display_every - - def before_run(self, run_context): - - return tf.train.SessionRunArgs( - #fetches=[tf.train.get_global_step(), 'total_loss:0', 'loss_scale:0', 'overflow_status_reduce_all:0']) - fetches=[tf.train.get_global_step(), 'total_loss:0']) - - def after_run(self, run_context, run_values): - - # global_step, total_loss, loss_scaler, isFinite = run_values.results - global_step, total_loss = run_values.results - print('###Test print### Step = %6i Loss = %9.6f ' % (global_step, total_loss), flush=True) - -training_hooks = [] -training_hooks.append(_LogSessionRunHook(1)) - - -def main(_): - tf.logging.set_verbosity(tf.logging.INFO) - train_mode = FLAGS.train_mode - ############################################################################ - # Load the dual_encoder_config_file file. - ############################################################################ - if tf.gfile.Exists(FLAGS.dual_encoder_config_file): - exp_config = utils.load_config_from_file( - FLAGS.dual_encoder_config_file, - experiment_config_pb2.DualEncoderConfig() - ) - else: - raise ValueError("dual_encoder_config: {} not found!".format( - FLAGS.dual_encoder_config_file)) - tf.logging.info(">>>> final dual_encoder_config:\n {}".format(exp_config)) - tf.gfile.MakeDirs(FLAGS.output_dir) - - ############################################################################ - # Save/copy the configuration file. - ############################################################################ - configs_dir = os.path.join(FLAGS.output_dir, "configs") - tf.gfile.MakeDirs(configs_dir) - tf.gfile.MakeDirs(FLAGS.output_dir) - with tf.gfile.Open( - os.path.join(configs_dir, "dual_encoder_config.pbtxt"), "w") as fout: - print(exp_config, file=fout) - - # Write bert_config.json and doc_bert_config.json. 
- tf.gfile.Copy( - exp_config.encoder_config.bert_config_file, - os.path.join(configs_dir, "bert_config.json"), - overwrite=True) - tf.gfile.Copy( - exp_config.encoder_config.doc_bert_config_file, - os.path.join(configs_dir, "doc_bert_config.json"), - overwrite=True) - - # Write vocab file(s). - tf.gfile.Copy( - exp_config.encoder_config.vocab_file, - os.path.join(configs_dir, "vocab.txt"), - overwrite=True) - - # Save other important parameters as a json file. - hparams = { - "dual_encoder_config_file": FLAGS.dual_encoder_config_file, - "output_dir": FLAGS.output_dir, - "schedule": FLAGS.schedule, - "debugging": FLAGS.debugging, - "learning_rate": FLAGS.learning_rate, - "num_warmup_steps": FLAGS.num_warmup_steps, - "num_train_steps": FLAGS.num_train_steps, - "num_tpu_cores": FLAGS.num_tpu_cores - } - with tf.gfile.Open(os.path.join(configs_dir, "hparams.json"), "w") as fout: - json.dump(hparams, fout) - tf.logging.info(">>>> saved hparams.json:\n {}".format(hparams)) - - ############################################################################ - # Run the train/eval/predict/export process based on the schedule. - ############################################################################ - max_seq_length_actual, max_predictions_per_seq_actual = \ - utils.get_actual_max_seq_len(exp_config.encoder_config.model_name, - exp_config.encoder_config.max_doc_length_by_sentence, - exp_config.encoder_config.max_sent_length_by_word, - exp_config.encoder_config.max_predictions_per_seq) - - # Prepare input for train and eval. - input_files = [] - for input_pattern in exp_config.train_eval_config.input_file_for_train.split(","): - input_files.extend(tf.gfile.Glob(input_pattern)) - input_file_num = 0 - tf.logging.info("*** Input Files ***") - for input_file in input_files: - tf.logging.info(" %s" % input_file) - input_file_num += 1 - if input_file_num > 10: - break - tf.logging.info("train input_files[0:10]: %s " % "\n".join(input_files[0:10])) - eval_files = [] - if exp_config.train_eval_config.eval_with_eval_data: - eval_files = [] - for input_pattern in exp_config.train_eval_config.input_file_for_eval.split(","): - eval_files.extend(tf.gfile.Glob(input_pattern)) - else: - eval_files = input_files - - input_fn_builder = input_fns.input_fn_builder - # Prepare the input functions. - # Drop_remainder = True during training to maintain fixed batch size. - train_input_fn = input_fn_builder( - input_files=input_files, - is_training=True, - drop_remainder=True, - max_seq_length=max_seq_length_actual, - max_predictions_per_seq=max_predictions_per_seq_actual, - num_cpu_threads=4, - batch_size=exp_config.train_eval_config.train_batch_size, - ) - eval_drop_remainder = True if FLAGS.use_tpu else False - eval_input_fn = input_fn_builder( - input_files=eval_files, - max_seq_length=max_seq_length_actual, - max_predictions_per_seq=max_predictions_per_seq_actual, - is_training=False, - drop_remainder=eval_drop_remainder, - batch_size=exp_config.train_eval_config.eval_batch_size) - predict_input_fn = input_fn_builder( - input_files=eval_files, - max_seq_length=max_seq_length_actual, - max_predictions_per_seq=max_predictions_per_seq_actual, - is_training=False, - drop_remainder=eval_drop_remainder, - batch_size=exp_config.train_eval_config.predict_batch_size, - is_prediction=True) - - # Build and run the model. 
-  tpu_cluster_resolver = None
-  if FLAGS.use_tpu and FLAGS.tpu_name:
-    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
-        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
-
-  # The commented-out block below was added while debugging precision issues:
-  # it configures mixed precision, switches fusion rules on/off, adds ops to
-  # the blacklist, and sets up the data-dump options.
-  """
-  config = tf.ConfigProto(allow_soft_placement=True)
-  custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
-  custom_op.name = "NpuOptimizer"
-  custom_op.parameter_map["use_off_line"].b = True
-  custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision")
-  custom_op.parameter_map["fusion_switch_file"].s = tf.compat.as_bytes("/home/test_user06/tc_workspace/tmp/fusion_switch_off.cfg")
-  #custom_op.parameter_map["enable_dump_debug"].b = True
-  #custom_op.parameter_map["dump_path"].s = tf.compat.as_bytes('/home/test_user08/tc_workspace/tmp/smith')
-  #custom_op.parameter_map["dump_debug_mode"].s = tf.compat.as_bytes("all")
-
-  session_config = npu_config_proto(config_proto=config)
-  ######### Add dump config by:TC #################
-  """
-
-  is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
-  run_config = tf.estimator.tpu.RunConfig(
-      ##################NPU_modify start#############################
-      # session_config=session_config,
-      ##################NPU_modify end#############################
-      cluster=tpu_cluster_resolver,
-      master=FLAGS.master,
-      model_dir=FLAGS.output_dir,
-      save_checkpoints_steps=exp_config.train_eval_config
-      .save_checkpoints_steps,
-      tpu_config=tf.estimator.tpu.TPUConfig(
-          iterations_per_loop=exp_config.train_eval_config.iterations_per_loop,
-          num_shards=FLAGS.num_tpu_cores,
-          per_host_input_for_training=is_per_host), save_summary_steps=0)
-
-  model_fn = smith_modeling.model_fn_builder(
-      dual_encoder_config=exp_config,
-      train_mode=FLAGS.train_mode,
-      learning_rate=FLAGS.learning_rate,
-      num_train_steps=FLAGS.num_train_steps,
-      num_warmup_steps=FLAGS.num_warmup_steps,
-      use_tpu=FLAGS.use_tpu,
-      use_one_hot_embeddings=FLAGS.use_tpu,
-      debugging=FLAGS.debugging)
-
-  # If TPU is not available, this will fall back to normal Estimator on CPU
-  # or GPU. The batch size for eval and predict is the same.
-  estimator = tf.estimator.tpu.TPUEstimator(
-      use_tpu=False,
-      model_fn=model_fn,
-      config=npu_run_config_init(run_config=run_config),
-      train_batch_size=exp_config.train_eval_config.train_batch_size,
-      eval_batch_size=exp_config.train_eval_config.eval_batch_size,
-      predict_batch_size=exp_config.train_eval_config.predict_batch_size)
-
-  if FLAGS.schedule == "train":
-    tf.logging.info("***** Running training *****")
-    tf.logging.info("  Batch size = %d",
-                    exp_config.train_eval_config.train_batch_size)
-    estimator.train(input_fn=train_input_fn, hooks=training_hooks, max_steps=FLAGS.num_train_steps)
-  elif FLAGS.schedule == "continuous_eval":
-    tf.logging.info("***** Running continuous evaluation *****")
-    tf.logging.info("  Batch size = %d",
-                    exp_config.train_eval_config.eval_batch_size)
-    # checkpoints_iterator blocks until a new checkpoint appears.
-    for ckpt in tf.train.checkpoints_iterator(estimator.model_dir):
-      try:
-        # Estimator automatically loads and evaluates the latest checkpoint.
-        result = estimator.evaluate(
-            input_fn=eval_input_fn,
-            steps=exp_config.train_eval_config.max_eval_steps)
-        tf.logging.info("***** Eval results for %s *****", ckpt)
-        for key, value in result.items():
-          tf.logging.info("  %s = %s", key, str(value))
-
-      except tf.errors.NotFoundError:
-        # Checkpoint might get garbage collected before the eval can run.
- tf.logging.error("Checkpoint path '%s' no longer exists.", ckpt) - elif FLAGS.schedule == "predict": - # Load the model checkpoint and run the prediction process - # to get the predicted scores and labels. The batch size is the same with - # the eval batch size. For more options, refer to - # https://www.tensorflow.org/api_docs/python/tf/compat/v1/estimator/tpu/TPUEstimator#predict - tf.logging.info("***** Running prediction with ckpt {} *****".format( - exp_config.encoder_config.predict_checkpoint)) - tf.logging.info(" Batch size = %d", - exp_config.train_eval_config.eval_batch_size) - output_predict_file = os.path.join(FLAGS.output_dir, - "prediction_results.json") - # Output the prediction results in json format. - pred_res_list = [] - with tf.gfile.GFile(output_predict_file, "w") as writer: - written_line_index = 0 - tf.logging.info("***** Predict results *****") - for result in estimator.predict( - input_fn=predict_input_fn, - checkpoint_path=exp_config.encoder_config.predict_checkpoint, - yield_single_examples=True): - if (exp_config.encoder_config.model_name == - constants.MODEL_NAME_SMITH_DUAL_ENCODER): - pred_item_dict = utils.get_pred_res_list_item_smith_de(result) - else: - raise ValueError("Unsupported model name: %s" % - exp_config.encoder_config.model_name) - pred_res_list.append(pred_item_dict) - written_line_index += 1 - if written_line_index % 500 == 0: - tf.logging.info( - "Current written_line_index: {} *****".format(written_line_index)) - tf.logging.info("***** Finished prediction for %d examples *****", - written_line_index) - tf.logging.info("***** Output prediction results into %s *****", - output_predict_file) - json.dump(pred_res_list, writer) - - elif FLAGS.schedule == "export": - run_config = tf.estimator.RunConfig( - model_dir=FLAGS.output_dir, - save_checkpoints_steps=exp_config.train_eval_config - .save_checkpoints_steps, save_summary_steps=0) - #estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) - export_dir_base = os.path.join(FLAGS.output_dir, "export/") - tf.logging.info( - "***** Export the prediction checkpoint to the folder {} *****".format( - export_dir_base)) - tf.gfile.MakeDirs(export_dir_base) - estimator.export_saved_model( - export_dir_base=export_dir_base, - assets_extra={"vocab.txt": exp_config.encoder_config.vocab_file}, - serving_input_receiver_fn=input_fns.make_serving_input_example_fn( - max_seq_length=max_seq_length_actual, - max_predictions_per_seq=max_predictions_per_seq_actual), - checkpoint_path=exp_config.encoder_config.predict_checkpoint) - else: - raise ValueError("Unsupported schedule : %s" % FLAGS.schedule) - - -if __name__ == "__main__": - flags.mark_flag_as_required("dual_encoder_config_file") - flags.mark_flag_as_required("output_dir") - app.run(main) - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/utils.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/utils.py deleted file mode 100644 index 8e1ba81401b5e2ff9cebe005bf37db4d9c1172d9..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/utils.py +++ /dev/null @@ -1,262 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Google Research Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Copyright 2021 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-"""Helper functions for dual encoder SMITH model."""
-from npu_bridge.npu_init import *
-
-import collections
-from typing import Any, Text
-
-import tensorflow.compat.v1 as tf
-
-from google.protobuf import text_format
-from smith import constants
-
-
-def transfer_2d_array_to_str(array):
-  """Transfer a 2D float32 array to a string."""
-  str_list = []
-  for r in array:
-    str_list.append(",".join([str(e) for e in r]))
-  return " ".join(str_list)
-
-
-def get_actual_max_seq_len(model_name, max_doc_length_by_sentence,
-                           max_sent_length_by_word, max_predictions_per_seq):
-  """Get the actual maximum sequence length.
-
-  Args:
-    model_name: The name of the model.
-    max_doc_length_by_sentence: The maximum document length by the number of
-      sentences.
-    max_sent_length_by_word: The maximum sentence length by the number of
-      words.
-    max_predictions_per_seq: The maximum number of predicted masked tokens in
-      a sequence, which can be useful for the masked LM prediction task.
-
-  Returns:
-    The actual maximum sequence length and maximum number of masked LM
-    predictions per sequence. For the SMITH model, we need to consider the
-    maximum number of sentence blocks in a document to compute these
-    statistics.
-
-  Raises:
-    ValueError: if the arguments are not usable.
- - """ - if model_name == constants.MODEL_NAME_SMITH_DUAL_ENCODER: - max_seq_length_actual = \ - max_doc_length_by_sentence * max_sent_length_by_word - max_predictions_per_seq_actual = \ - max_doc_length_by_sentence * max_predictions_per_seq - else: - raise ValueError("Only the SMITH model is supported: %s" % model_name) - return (max_seq_length_actual, max_predictions_per_seq_actual) - - -def get_export_outputs_prediction_dict_smith_de( - seq_embed_1, seq_embed_2, predicted_score, predicted_class, - documents_match_labels, input_sent_embed_1, input_sent_embed_2, - output_sent_embed_1, output_sent_embed_2): - """Generates export and prediction dict for dual encoder SMITH model.""" - export_outputs = { - tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: - tf.estimator.export.PredictOutput(predicted_score), - "seq_embed_1": - tf.estimator.export.PredictOutput(seq_embed_1), - "seq_embed_2": - tf.estimator.export.PredictOutput(seq_embed_2), - "input_sent_embed_1": - tf.estimator.export.PredictOutput(input_sent_embed_1), - "input_sent_embed_2": - tf.estimator.export.PredictOutput(input_sent_embed_2), - "output_sent_embed_1": - tf.estimator.export.PredictOutput(output_sent_embed_1), - "output_sent_embed_2": - tf.estimator.export.PredictOutput(output_sent_embed_2), - "predicted_class": - tf.estimator.export.PredictOutput(predicted_class), - "documents_match_labels": - tf.estimator.export.PredictOutput(documents_match_labels) - } - - prediction_dict = { - "predicted_score": predicted_score, - "predicted_class": predicted_class, - "documents_match_labels": documents_match_labels, - "seq_embed_1": seq_embed_1, - "seq_embed_2": seq_embed_2, - "input_sent_embed_1": input_sent_embed_1, - "input_sent_embed_2": input_sent_embed_2, - "output_sent_embed_1": output_sent_embed_1, - "output_sent_embed_2": output_sent_embed_2 - } - return (export_outputs, prediction_dict) - - -def get_pred_res_list_item_smith_de(result): - """Update the prediction results list for the dual encoder SMITH model.""" - pred_item_dict = {} - pred_item_dict["predicted_score"] = str(result["predicted_score"]) - pred_item_dict["predicted_class"] = str(result["predicted_class"]) - pred_item_dict["documents_match_labels"] = str( - result["documents_match_labels"][0]) - pred_item_dict["seq_embed_1"] = ",".join( - [str(e) for e in result["seq_embed_1"]]) - pred_item_dict["seq_embed_2"] = ",".join( - [str(e) for e in result["seq_embed_2"]]) - pred_item_dict["input_sent_embed_1"] = transfer_2d_array_to_str( - result["input_sent_embed_1"]) - pred_item_dict["input_sent_embed_2"] = transfer_2d_array_to_str( - result["input_sent_embed_2"]) - pred_item_dict["output_sent_embed_1"] = transfer_2d_array_to_str( - result["output_sent_embed_1"]) - pred_item_dict["output_sent_embed_2"] = transfer_2d_array_to_str( - result["output_sent_embed_2"]) - return pred_item_dict - - -def load_config_from_file(config_file, protobuf): - """Return the config proto loaded from config_file. - - Args: - config_file: a string to the path of a pbtxt file. - protobuf: an instance of a proto. - - Returns: - An parsed of proto with the same type of protobuf. - - Raises: - IOError: if config_file does not exist. - ParseError: if a wrong protobuf is given. 
- """ - if not tf.io.gfile.exists(config_file): - raise IOError("{} does not exist!".format(config_file)) - with tf.gfile.Open(config_file, "r") as reader: - proto = text_format.Parse(reader.read(), protobuf) - return proto - - -MaskedLmInstance = collections.namedtuple("MaskedLmInstance", - ["index", "label"]) - - -def create_masked_lm_predictions(tokens, masked_lm_prob, - max_predictions_per_seq, vocab_words, rng): - """Creates the predictions for the masked LM objective in preprocessing.""" - - cand_indexes = [] - for (i, token) in enumerate(tokens): - if token == "[CLS]" or token == "[SEP]": - continue - cand_indexes.append(i) - - rng.shuffle(cand_indexes) - - output_tokens = list(tokens) - - num_to_predict = min(max_predictions_per_seq, - max(1, int(round(len(tokens) * masked_lm_prob)))) - - masked_lms = [] - covered_indexes = set() - for index in cand_indexes: - if len(masked_lms) >= num_to_predict: - break - if index in covered_indexes: - continue - covered_indexes.add(index) - - masked_token = None - # 80% of the time, replace with [MASK] - if rng.random() < 0.8: - masked_token = "[MASK]" - else: - # 10% of the time, keep original - if rng.random() < 0.5: - masked_token = tokens[index] - # 10% of the time, replace with random word - else: - masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] - - output_tokens[index] = masked_token - - masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) - - masked_lms = sorted(masked_lms, key=lambda x: x.index) - - masked_lm_positions = [] - masked_lm_labels = [] - for p in masked_lms: - masked_lm_positions.append(p.index) - masked_lm_labels.append(p.label) - - return (output_tokens, masked_lm_positions, masked_lm_labels) - - -def get_majority_vote(rating_scores): - """Compute majority vote rating given a list. - - Args: - rating_scores: a list of rating scores. - - Returns: - The majority voting rating. - - """ - return collections.Counter(rating_scores).most_common()[0][0] - - -def get_mean_score(rating_scores): - """Compute the mean rating score given a list. - - Args: - rating_scores: a list of rating scores. - - Returns: - The mean rating. - - """ - return sum(rating_scores) / len(rating_scores) - - -def create_int_feature(values): - feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) - return feature - - -def create_float_feature(values): - feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) - return feature - - -def create_bytes_feature(values): - feature = tf.train.Feature(bytes_list=tf.train.BytesList(value=list(values))) - return feature - diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/wiki_doc_pair.proto b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/wiki_doc_pair.proto deleted file mode 100644 index 5c1fa2fa86237b9b02c0221b2a87186a20b0856a..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/wiki_doc_pair.proto +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2021 The Google Research Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -// Definition of a pair of two Wikipedia document objects. -// This proto is mainly for external data release. - -syntax = "proto2"; - -package smith; - -// Definition of a pair of two WikiDoc objects. -// NextID: 10 -message WikiDocPair { - // An id that uniquely identifies this document pair. The id can be generated - // based on the urls of the document pair. - optional string id = 1; - - // The classification label generated by machine. - // We set this as int in case we would like to change number of graded - // levels of this label. - optional int32 machine_label_for_classification = 2; - - // The classification label generated by human. - optional int32 human_label_for_classification = 3; - - // The regression label generated by machine. - optional float machine_label_for_regression = 4; - - // The regression label generated by human. - optional float human_label_for_regression = 5; - - // Two document objects with similarity labels. - optional WikiDoc doc_one = 6; - optional WikiDoc doc_two = 7; - - // The model predicted similarity score for this pair. - optional float model_prediction = 8; - - // The raw human rating scores. - repeated int32 human_label = 9; -} - -// Definition of contents in a WikiDoc objects. -// NextID: 7 -message WikiDoc { - // An id that uniquely identifies this document. The id can be generated - // based on the url of the document. - optional string id = 1; - - // The url of the WikiDoc page. - optional string url = 2; - - // The title of the WikiDoc page. - optional string title = 3; - - // The description of the WikiDoc page. - optional string description = 4; - - // The section contents of the WikiDoc page. - repeated Section section_contents = 5; - - // A list of image ids of images in the WikiDoc page. - repeated string image_ids = 6; -} - -// Definition of sections in WikiDoc pages. -// NextID: 3 -message Section { - optional string title = 1; - optional string text = 2; -} diff --git a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/wiki_doc_pair_pb2.py b/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/wiki_doc_pair_pb2.py deleted file mode 100644 index 1dbb0223b831af060765e1341cc64271499615c2..0000000000000000000000000000000000000000 --- a/TensorFlow/contrib/nlp/smith_ID2025_for_Tensorlfow/wiki_doc_pair_pb2.py +++ /dev/null @@ -1,268 +0,0 @@ -# coding=utf-8 -# Copyright 2021 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -# Generated by the protocol buffer compiler. DO NOT EDIT! 
-# source: smith/wiki_doc_pair.proto
-"""Generated protocol buffer code."""
-from npu_bridge.npu_init import *
-from google.protobuf import descriptor as _descriptor
-from google.protobuf import message as _message
-from google.protobuf import reflection as _reflection
-from google.protobuf import symbol_database as _symbol_database
-# @@protoc_insertion_point(imports)
-
-_sym_db = _symbol_database.Default()
-
-
-
-
-DESCRIPTOR = _descriptor.FileDescriptor(
-  name='smith/wiki_doc_pair.proto',
-  package='smith',
-  syntax='proto2',
-  serialized_options=None,
-  create_key=_descriptor._internal_create_key,
-  serialized_pb=b'\n\x19smith/wiki_doc_pair.proto\x12\x05smith\"\xa6\x02\n\x0bWikiDocPair\x12\n\n\x02id\x18\x01 \x01(\t\x12(\n machine_label_for_classification\x18\x02 \x01(\x05\x12&\n\x1ehuman_label_for_classification\x18\x03 \x01(\x05\x12$\n\x1cmachine_label_for_regression\x18\x04 \x01(\x02\x12\"\n\x1ahuman_label_for_regression\x18\x05 \x01(\x02\x12\x1f\n\x07\x64oc_one\x18\x06 \x01(\x0b\x32\x0e.smith.WikiDoc\x12\x1f\n\x07\x64oc_two\x18\x07 \x01(\x0b\x32\x0e.smith.WikiDoc\x12\x18\n\x10model_prediction\x18\x08 \x01(\x02\x12\x13\n\x0bhuman_label\x18\t \x03(\x05\"\x83\x01\n\x07WikiDoc\x12\n\n\x02id\x18\x01 \x01(\t\x12\x0b\n\x03url\x18\x02 \x01(\t\x12\r\n\x05title\x18\x03 \x01(\t\x12\x13\n\x0b\x64\x65scription\x18\x04 \x01(\t\x12(\n\x10section_contents\x18\x05 \x03(\x0b\x32\x0e.smith.Section\x12\x11\n\timage_ids\x18\x06 \x03(\t\"&\n\x07Section\x12\r\n\x05title\x18\x01 \x01(\t\x12\x0c\n\x04text\x18\x02 \x01(\t'
-)
-
-
-
-
-_WIKIDOCPAIR = _descriptor.Descriptor(
-  name='WikiDocPair',
-  full_name='smith.WikiDocPair',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  create_key=_descriptor._internal_create_key,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='id', full_name='smith.WikiDocPair.id', index=0,
-      number=1, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=b"".decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
-    _descriptor.FieldDescriptor(
-      name='machine_label_for_classification', full_name='smith.WikiDocPair.machine_label_for_classification', index=1,
-      number=2, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
-    _descriptor.FieldDescriptor(
-      name='human_label_for_classification', full_name='smith.WikiDocPair.human_label_for_classification', index=2,
-      number=3, type=5, cpp_type=1, label=1,
-      has_default_value=False, default_value=0,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
-    _descriptor.FieldDescriptor(
-      name='machine_label_for_regression', full_name='smith.WikiDocPair.machine_label_for_regression', index=3,
-      number=4, type=2, cpp_type=6, label=1,
-      has_default_value=False, default_value=float(0),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
-    _descriptor.FieldDescriptor(
-      name='human_label_for_regression', full_name='smith.WikiDocPair.human_label_for_regression', index=4,
-      number=5, type=2, cpp_type=6, label=1,
-      has_default_value=False, default_value=float(0),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
-    _descriptor.FieldDescriptor(
-      name='doc_one', full_name='smith.WikiDocPair.doc_one', index=5,
-      number=6, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
-    _descriptor.FieldDescriptor(
-      name='doc_two', full_name='smith.WikiDocPair.doc_two', index=6,
-      number=7, type=11, cpp_type=10, label=1,
-      has_default_value=False, default_value=None,
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
-    _descriptor.FieldDescriptor(
-      name='model_prediction', full_name='smith.WikiDocPair.model_prediction', index=7,
-      number=8, type=2, cpp_type=6, label=1,
-      has_default_value=False, default_value=float(0),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
-    _descriptor.FieldDescriptor(
-      name='human_label', full_name='smith.WikiDocPair.human_label', index=8,
-      number=9, type=5, cpp_type=1, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  serialized_options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=37,
-  serialized_end=331,
-)
-
-
-_WIKIDOC = _descriptor.Descriptor(
-  name='WikiDoc',
-  full_name='smith.WikiDoc',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  create_key=_descriptor._internal_create_key,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='id', full_name='smith.WikiDoc.id', index=0,
-      number=1, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=b"".decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
-    _descriptor.FieldDescriptor(
-      name='url', full_name='smith.WikiDoc.url', index=1,
-      number=2, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=b"".decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
-    _descriptor.FieldDescriptor(
-      name='title', full_name='smith.WikiDoc.title', index=2,
-      number=3, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=b"".decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
-    _descriptor.FieldDescriptor(
-      name='description', full_name='smith.WikiDoc.description', index=3,
-      number=4, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=b"".decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
-    _descriptor.FieldDescriptor(
-      name='section_contents', full_name='smith.WikiDoc.section_contents', index=4,
-      number=5, type=11, cpp_type=10, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
-    _descriptor.FieldDescriptor(
-      name='image_ids', full_name='smith.WikiDoc.image_ids', index=5,
-      number=6, type=9, cpp_type=9, label=3,
-      has_default_value=False, default_value=[],
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  serialized_options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=334,
-  serialized_end=465,
-)
-
-
-_SECTION = _descriptor.Descriptor(
-  name='Section',
-  full_name='smith.Section',
-  filename=None,
-  file=DESCRIPTOR,
-  containing_type=None,
-  create_key=_descriptor._internal_create_key,
-  fields=[
-    _descriptor.FieldDescriptor(
-      name='title', full_name='smith.Section.title', index=0,
-      number=1, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=b"".decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
-    _descriptor.FieldDescriptor(
-      name='text', full_name='smith.Section.text', index=1,
-      number=2, type=9, cpp_type=9, label=1,
-      has_default_value=False, default_value=b"".decode('utf-8'),
-      message_type=None, enum_type=None, containing_type=None,
-      is_extension=False, extension_scope=None,
-      serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key),
-  ],
-  extensions=[
-  ],
-  nested_types=[],
-  enum_types=[
-  ],
-  serialized_options=None,
-  is_extendable=False,
-  syntax='proto2',
-  extension_ranges=[],
-  oneofs=[
-  ],
-  serialized_start=467,
-  serialized_end=505,
-)
-
-_WIKIDOCPAIR.fields_by_name['doc_one'].message_type = _WIKIDOC
-_WIKIDOCPAIR.fields_by_name['doc_two'].message_type = _WIKIDOC
-_WIKIDOC.fields_by_name['section_contents'].message_type = _SECTION
-DESCRIPTOR.message_types_by_name['WikiDocPair'] = _WIKIDOCPAIR
-DESCRIPTOR.message_types_by_name['WikiDoc'] = _WIKIDOC
-DESCRIPTOR.message_types_by_name['Section'] = _SECTION
-_sym_db.RegisterFileDescriptor(DESCRIPTOR)
-
-WikiDocPair = _reflection.GeneratedProtocolMessageType('WikiDocPair', (_message.Message,), {
-  'DESCRIPTOR' : _WIKIDOCPAIR,
-  '__module__' : 'smith.wiki_doc_pair_pb2'
-  # @@protoc_insertion_point(class_scope:smith.WikiDocPair)
-  })
-_sym_db.RegisterMessage(WikiDocPair)
-
-WikiDoc = _reflection.GeneratedProtocolMessageType('WikiDoc', (_message.Message,), {
-  'DESCRIPTOR' : _WIKIDOC,
-  '__module__' : 'smith.wiki_doc_pair_pb2'
-  # @@protoc_insertion_point(class_scope:smith.WikiDoc)
-  })
-_sym_db.RegisterMessage(WikiDoc)
-
-Section = _reflection.GeneratedProtocolMessageType('Section', (_message.Message,), {
-  'DESCRIPTOR' : _SECTION,
-  '__module__' : 'smith.wiki_doc_pair_pb2'
-  # @@protoc_insertion_point(class_scope:smith.Section)
-  })
-_sym_db.RegisterMessage(Section)
-
-
-# @@protoc_insertion_point(module_scope)
-