diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/.github/issue_template.md b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/.github/issue_template.md new file mode 100644 index 0000000000000000000000000000000000000000..7f5140d83cac2f59bd733404904be9ef064f252c --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/.github/issue_template.md @@ -0,0 +1,25 @@ +提问时请尽可能提供如下信息: + +### 基本信息 +- 你使用的**操作系统**: +- 你使用的**Python**版本: +- 你使用的**Tensorflow**版本: +- 你使用的**Keras**版本: +- 你使用的**bert4keras**版本: +- 你使用纯**keras**还是**tf.keras**: +- 你加载的**预训练模型**: + +### 核心代码 +```python +# 请在此处贴上你的核心代码。 +# 请尽量只保留关键部分,不要无脑贴全部代码。 +``` + +### 输出信息 +```shell +# 请在此处贴上你的调试输出 +``` + +### 自我尝试 +不管什么问题,请先尝试自行解决,“万般努力”之下仍然无法解决再来提问。此处请贴上你的努力过程。 + diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/.idea/.gitignore b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/.idea/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..26d33521af10bcc7fd8cea344038eaaeb78d0ef5 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/.idea/bert4keras-master.iml b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/.idea/bert4keras-master.iml new file mode 100644 index 0000000000000000000000000000000000000000..d0876a78d06ac03b5d78c8dcdb95570281c6f1d6 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/.idea/bert4keras-master.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/.idea/inspectionProfiles/profiles_settings.xml b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000000000000000000000000000000000000..105ce2da2d6447d11dfe32bfb846c3d5b199fc99 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/.idea/modules.xml b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/.idea/modules.xml new file mode 100644 index 0000000000000000000000000000000000000000..0d2972b32fba1e06ff539cac59d9ccfeea86a78a --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/.idea/vcs.xml b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/.idea/vcs.xml new file mode 100644 index 0000000000000000000000000000000000000000..bc59970703f937a5163639ab26909ffffe46640d --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/LICENSE b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..d645695673349e3947e8e5ae42332d0ac3164cd7 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/README.md b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bec2e55b660f5410c532c11f080c42573caae65a --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/README.md @@ -0,0 +1,164 @@ +# bert4keras +- Our light reimplement of bert for keras +- 更清晰、更轻量级的keras版bert +- 个人博客:https://kexue.fm/ +- 在线文档:http://bert4keras.spaces.ac.cn/ (还在构建中) + +## 说明 +这是笔者重新实现的keras版的transformer模型库,致力于用尽可能清爽的代码来实现结合transformer和keras。 + +本项目的初衷是为了修改、定制上的方便,所以可能会频繁更新。 + +因此欢迎star,但不建议fork,因为你fork下来的版本可能很快就过期了。 + +## 功能 +目前已经实现: +- 加载bert/roberta/albert的预训练权重进行finetune; +- 实现语言模型、seq2seq所需要的attention mask; +- 丰富的examples; +- 从零预训练代码(支持TPU、多GPU,请看pretraining); +- 兼容keras、tf.keras + +## 使用 +安装稳定版: +```shell +pip install bert4keras +``` +安装最新版: +```shell +pip install git+https://www.github.com/bojone/bert4keras.git +``` + +使用例子请参考examples目录。 + +之前基于keras-bert给出的例子,仍适用于本项目,只需要将`bert_model`的加载方式换成本项目的。 + +理论上兼容Python2和Python3,兼容tensorflow 1.14+和tensorflow 2.x,实验环境是Python 2.7、Tesorflow 1.14+以及Keras 2.3.1(已经在2.2.4、2.3.0、2.3.1、tf.keras下测试通过)。 + +**为了获得最好的体验,建议你使用Tensorflow 1.14 + Keras 2.3.1组合。** + +
关于环境组合 + +- 支持tf+keras和tf+tf.keras,后者需要提前传入环境变量TF_KERAS=1。 + +- 当使用tf+keras时,建议2.2.4 <= keras <= 2.3.1,以及 1.14 <= tf <= 2.2,不能使用tf 2.3+。 + +- keras 2.4+可以用,但事实上keras 2.4.x基本上已经完全等价于tf.keras了,因此如果你要用keras 2.4+,倒不如直接用tf.keras。 +
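+
+A minimal usage sketch for the tf+tf.keras combination (assumptions: bert4keras is already installed, and the paths below are placeholders for a downloaded BERT checkpoint such as chinese_L-12_H-768_A-12; the call pattern mirrors examples/basic_extract_features.py from this repository):
+
+```python
+import os
+# TF_KERAS must be set before bert4keras is imported, otherwise plain keras is used.
+os.environ['TF_KERAS'] = '1'
+
+from bert4keras.models import build_transformer_model
+from bert4keras.tokenizers import Tokenizer
+from bert4keras.snippets import to_array
+
+# Placeholder paths -- point them at your own checkpoint files.
+config_path = '/path/to/chinese_L-12_H-768_A-12/bert_config.json'
+checkpoint_path = '/path/to/chinese_L-12_H-768_A-12/bert_model.ckpt'
+dict_path = '/path/to/chinese_L-12_H-768_A-12/vocab.txt'
+
+tokenizer = Tokenizer(dict_path, do_lower_case=True)           # build the tokenizer
+model = build_transformer_model(config_path, checkpoint_path)  # build the model and load the weights
+
+token_ids, segment_ids = tokenizer.encode(u'语言模型')
+token_ids, segment_ids = to_array([token_ids], [segment_ids])
+print(model.predict([token_ids, segment_ids]))                 # per-token encodings of the sentence
+```
+
+The same script runs under plain keras as well: leave TF_KERAS unset and keep 2.2.4 <= keras <= 2.3.1 as recommended above.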
+ +当然,乐于贡献的朋友如果发现了某些bug的话,也欢迎指出修正甚至Pull Requests~ + +## 权重 + +目前支持加载的权重: +- Google原版bert: https://github.com/google-research/bert +- brightmart版roberta: https://github.com/brightmart/roberta_zh +- 哈工大版roberta: https://github.com/ymcui/Chinese-BERT-wwm +- Google原版albert[例子]: https://github.com/google-research/ALBERT +- brightmart版albert: https://github.com/brightmart/albert_zh +- 转换后的albert: https://github.com/bojone/albert_zh +- 华为的NEZHA: https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/NEZHA-TensorFlow +- 华为的NEZHA-GEN: https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/NEZHA-Gen-TensorFlow +- 自研语言模型: https://github.com/ZhuiyiTechnology/pretrained-models +- T5模型: https://github.com/google-research/text-to-text-transfer-transformer +- GPT_OpenAI: https://github.com/bojone/CDial-GPT-tf +- GPT2_ML: https://github.com/imcaspar/gpt2-ml +- Google原版ELECTRA: https://github.com/google-research/electra +- 哈工大版ELECTRA: https://github.com/ymcui/Chinese-ELECTRA +- CLUE版ELECTRA: https://github.com/CLUEbenchmark/ELECTRA +- LaBSE(多国语言BERT): https://github.com/bojone/labse +- Chinese-GEN项目下的模型: https://github.com/bojone/chinese-gen +- T5.1.1: https://github.com/google-research/text-to-text-transfer-transformer/blob/master/released_checkpoints.md#t511 +- Multilingual T5: https://github.com/google-research/multilingual-t5/ + +注意事项 +- 注1:brightmart版albert的开源时间早于Google版albert,这导致早期brightmart版albert的权重与Google版的不完全一致,换言之两者不能直接相互替换。为了减少代码冗余,bert4keras的0.2.4及后续版本均只支持加载Google版以brightmart版中带Google字眼的权重。如果要加载早期版本的权重,请用0.2.3版本,或者考虑作者转换过的albert_zh。 +- 注2:下载下来的ELECTRA权重,如果没有json配置文件的话,参考这里自己改一个(需要加上`type_vocab_size`字段)。 + +## 更新 +- 2022.03.20: 增加[RoFormerV2](https://kexue.fm/archives/8998)。 +- 2022.02.28: 增加[GatedAttentionUnit](https://kexue.fm/archives/8934)。 +- 2021.04.23: 增加[GlobalPointer](https://kexue.fm/archives/8373)。 +- 2021.03.23: 增加[RoFormer](https://kexue.fm/archives/8265)。 +- 2021.01.30: 发布0.9.9版,完善多GPU支持,增加多GPU例子:[task_seq2seq_autotitle_multigpu.py](https://github.com/bojone/bert4keras/blob/master/examples/task_seq2seq_autotitle_multigpu.py)。 +- 2020.12.29: 增加`residual_attention_scores`参数来实现RealFormer,只需要在`build_transformer_model`中传入参数`residual_attention_scores=True`启用。 +- 2020.12.04: `PositionEmbedding`引入层次分解,可以让BERT直接处理超长文本,在`build_transformer_model`中传入参数`hierarchical_position=True`启用。 +- 2020.11.19: 支持GPT2模型,参考[CPM_LM_bert4keras](https://github.com/bojone/CPM_LM_bert4keras)项目。 +- 2020.11.14: 新增分参数学习率`extend_with_parameter_wise_lr`,可用于给每层设置不同的学习率。 +- 2020.10.27: 支持T5.1.1Multilingual T5。 +- 2020.08.28: 支持GPT_OpenAI。 +- 2020.08.22: 新增`WebServing`类,允许简单地将模型转换为Web接口,详情请参考该类的说明。 +- 2020.07.14: `Transformer`类加入`prefix`参数;`snippets.py`引入`to_array`函数;`AutoRegressiveDecoder`修改`rtype='logits'`时的一个隐藏bug。 +- 2020.06.06: 强迫症作祟:将`Tokenizer`原来的`max_length`参数重命名为`maxlen`,同时保留向后兼容性,建议大家用新参数名。 +- 2020.04.29: 增加重计算(参考keras_recompute),可以通过时间换空间,通过设置环境变量`RECOMPUTE=1`启用。 +- 2020.04.25: 优化tf2下的表现。 +- 2020.04.16: 所有example均适配tensorflow 2.0。 +- 2020.04.06: 增加UniLM预训练模式(测试中)。 +- 2020.04.06: 完善`rematch`方法。 +- 2020.04.01: `Tokenizer`增加`rematch`方法,给出分词结果与原序列的映射关系。 +- 2020.03.30: 尽量统一py文件的写法。 +- 2020.03.25: 支持ELECTRA。 +- 2020.03.24: 继续加强`DataGenerator`,允许传入迭代器时进行局部shuffle。 +- 2020.03.23: 增加调整Attention的`key_size`的选项。 +- 2020.03.17: 增强`DataGenerator`;优化模型写法。 +- 2020.03.15: 支持GPT2_ML。 +- 2020.03.10: 支持Google的T5模型。 +- 2020.03.05: 将`tokenizer.py`更名为`tokenizers.py`。 +- 2020.03.05: `application='seq2seq'`改名为`application='unilm'`。 +- 2020.03.05: `build_bert_model`更名为`build_transformer_model`。 +- 
2020.03.05: 重写`models.py`结构。 +- 2020.03.04: 将`bert.py`更名为`models.py`。 +- 2020.03.02: 重构mask机制(用回Keras自带的mask机制),以便更好地编写更复杂的应用。 +- 2020.02.22: 新增`AutoRegressiveDecoder`类,统一处理Seq2Seq的解码问题。 +- 2020.02.19: transformer block的前缀改为Transformer(本来是Encoder),使得其含义局限性更少。 +- 2020.02.13: 优化`load_vocab`函数;将`build_bert_model`中的`keep_words`参数更名为`keep_tokens`,此处改动可能会对部分脚本产生影响。 +- 2020.01.18: 调整文本处理方式,去掉codecs的使用。 +- 2020.01.17: 各api日趋稳定,为了方便大家使用,打包到pypi,首个打包版本号为0.4.6。 +- 2020.01.10: 重写模型mask方案,某种程度上让代码更为简练清晰;后端优化。 +- 2019.12.27: 重构预训练代码,减少冗余;目前支持RoBERTa和GPT两种预训练方式,详见pretraining。 +- 2019.12.17: 适配华为的nezha权重,只需要在`build_bert_model`函数里加上`model='nezha'`;此外原来albert的加载方式`albert=True`改为`model='albert'`。 +- 2019.12.16: 通过跟keras 2.3+版本类似的思路给低版本引入层中层功能,从而恢复对低于2.3.0版本的keras的支持。 +- 2019.12.14: 新增Conditional Layer Normalization及相关demo。 +- 2019.12.09: 各example的data_generator规范化;修复application='lm'时的一个错误。 +- 2019.12.05: 优化tokenizer的do_lower_case,同时微调各个example。 +- 2019.11.23: 将train.py重命名为optimizers.py,更新大量优化器实现,全面兼容keras和tf.keras。 +- 2019.11.19: 将utils.py重命名为tokenizer.py。 +- 2019.11.19: 想来想去,最后还是决定把snippets放到bert4keras.snippets下面去好了。 +- 2019.11.18: 优化预训练权重加载逻辑,增加保存模型权重至Bert的checkpoint格式方法。 +- 2019.11.17: 分离一些与Bert本身不直接相关的常用代码片段到python_snippets,供其它项目共用。 +- 2019.11.11: 添加NSP部分。 +- 2019.11.05: 适配google版albert,不再支持非Google版albert_zh。 +- 2019.11.05: 以RoBERTa为例子的预训练代码开发完毕,同时支持TPU/多GPU训练,详见roberta。欢迎在此基础上构建更多的预训练代码。 +- 2019.11.01: 逐步增加预训练相关代码,详见pretraining。 +- 2019.10.28: 支持使用基于sentencepiece的tokenizer。 +- 2019.10.25: 引入原生tokenizer。 +- 2019.10.22: 引入梯度累积优化器。 +- 2019.10.21: 为了简化代码结构,决定放弃keras 2.3.0之前的版本的支持,目前只支持keras 2.3.0+以及tf.keras。 +- 2019.10.20: 应网友要求,现支持直接用`model.save`保存模型结构,用`load_model`加载整个模型(只需要在`load_model`之前执行`from bert4keras.layers import *`,不需要额外写`custom_objects`)。 +- 2019.10.09: 已兼容tf.keras,同时在tf 1.13和tf 2.0下的tf.keras测试通过,通过设置环境变量`TF_KERAS=1`来切换tf.keras。 +- 2019.10.09: 已兼容Keras 2.3.x,但只是临时方案,后续可能直接移除掉2.3之前版本的支持。 +- 2019.10.02: 适配albert,能成功加载albert_zh的权重,只需要在`load_pretrained_model`函数里加上`albert=True`。 + +## 背景 +之前一直用CyberZHG大佬的keras-bert,如果纯粹只是为了在keras下对bert进行调用和fine tune来说,keras-bert已经足够能让人满意了。 + +然而,如果想要在加载官方预训练权重的基础上,对bert的内部结构进行修改,那么keras-bert就比较难满足我们的需求了,因为keras-bert为了代码的复用性,几乎将每个小模块都封装为了一个单独的库,比如keras-bert依赖于keras-transformer,而keras-transformer依赖于keras-multi-head,keras-multi-head依赖于keras-self-attention,这样一重重依赖下去,改起来就相当头疼了。 + +所以,我决定重新写一个keras版的bert,争取在几个文件内把它完整地实现出来,减少这些依赖性,并且保留可以加载官方预训练权重的特性。 + +## 鸣谢 +感谢CyberZHG大佬实现的keras-bert,本实现有不少地方参考了keras-bert的源码,在此衷心感谢大佬的无私奉献。 + +## 引用 + +``` +@misc{bert4keras, + title={bert4keras}, + author={Jianlin Su}, + year={2020}, + howpublished={\url{https://bert4keras.spaces.ac.cn}}, +} +``` + +## 交流 +QQ交流群:808623966,微信群请加机器人微信号spaces_ac_cn diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/README.md b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db6b74f2f1e5de0b1f08ad10c4cbd1cfaf4a25de --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/README.md @@ -0,0 +1,38 @@ +# 例子合集 + +提示:Github上的examples只保证兼容Github上的最新版bert4keras,如果报错,请首先尝试升级bert4keras。 + +## 简介 + +- [basic_extract_features.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_extract_features.py): 基础测试,测试BERT对句子的编码序列。 +- [basic_gibbs_sampling_via_mlm.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_gibbs_sampling_via_mlm.py): 基础测试,利用BERT+Gibbs采样进行文本随机生成,参考[这里](https://kexue.fm/archives/8119)。 +- 
[basic_language_model_cpm_lm.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_language_model_cpm_lm.py): 基础测试,测试[CPM_LM](https://github.com/TsinghuaAI/CPM-Generate)的生成效果。 +- [basic_language_model_gpt2_ml.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_language_model_gpt2_ml.py): 基础测试,测试[GPT2_ML](https://github.com/imcaspar/gpt2-ml)的生成效果。 +- [basic_language_model_nezha_gen_gpt.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_language_model_nezha_gen_gpt.py): 基础测试,测试[GPT Base(又叫NEZHE-GEN)](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/NEZHA-Gen-TensorFlow)的生成效果。 +- [basic_make_uncased_model_cased.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_make_uncased_model_cased.py): 基础测试,通过简单修改词表,使得不区分大小写的模型有区分大小写的能力。 +- [basic_masked_language_model.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_masked_language_model.py): 基础测试,测试BERT的MLM模型效果。 +- [basic_simple_web_serving_simbert.py](https://github.com/bojone/bert4keras/tree/master/examples/basic_simple_web_serving_simbert.py): 基础测试,测试自带的WebServing(将模型转化为Web接口)。 +- [task_conditional_language_model.py](https://github.com/bojone/bert4keras/tree/master/examples/task_conditional_language_model.py): 任务例子,结合 BERT + [Conditional Layer Normalization](https://kexue.fm/archives/7124) 做条件语言模型。 +- [task_iflytek_adversarial_training.py](https://github.com/bojone/bert4keras/tree/master/examples/task_iflytek_adversarial_training.py): 任务例子,通过[对抗训练](https://kexue.fm/archives/7234)提升分类效果。 +- [task_iflytek_bert_of_theseus.py](https://github.com/bojone/bert4keras/tree/master/examples/task_iflytek_bert_of_theseus.py): 任务例子,通过[BERT-of-Theseus](https://kexue.fm/archives/7575)来进行模型压缩。 +- [task_iflytek_gradient_penalty.py](https://github.com/bojone/bert4keras/tree/master/examples/task_iflytek_gradient_penalty.py): 任务例子,通过[梯度惩罚](https://kexue.fm/archives/7234)提升分类效果,可以视为另一种对抗训练。 +- [task_iflytek_multigpu.py](https://github.com/bojone/bert4keras/tree/master/examples/task_iflytek_multigpu.py): 任务例子,文本分类多GPU版。 +- [task_image_caption.py](https://github.com/bojone/bert4keras/tree/master/examples/task_image_caption.py): 任务例子,BERT + [Conditional Layer Normalization](https://kexue.fm/archives/7124) + ImageNet预训练模型 来做图像描述生成。 +- [task_language_model.py](https://github.com/bojone/bert4keras/tree/master/examples/task_language_model.py): 任务例子,加载BERT的预训练权重做无条件语言模型,效果上等价于GPT。 +- [task_language_model_chinese_chess.py](https://github.com/bojone/bert4keras/tree/master/examples/task_language_model_chinese_chess.py): 任务例子,用GPT的方式下中国象棋,过程请参考[博客](https://kexue.fm/archives/7877)。 +- [task_question_answer_generation_by_seq2seq.py](https://github.com/bojone/bert4keras/tree/master/examples/task_question_answer_generation_by_seq2seq.py): 任务例子,通过[UniLM](https://kexue.fm/archives/6933)式的Seq2Seq模型来做[问答对自动构建](https://kexue.fm/archives/7630),属于自回归文本生成。 +- [task_reading_comprehension_by_mlm.py](https://github.com/bojone/bert4keras/tree/master/examples/task_reading_comprehension_by_mlm.py): 任务例子,通过MLM模型来做[阅读理解问答](https://kexue.fm/archives/7148),属于简单的非自回归文本生成。 +- [task_reading_comprehension_by_seq2seq.py](https://github.com/bojone/bert4keras/tree/master/examples/task_reading_comprehension_by_seq2seq.py): 任务例子,通过[UniLM](https://kexue.fm/archives/6933)式的Seq2Seq模型来做[阅读理解问答](https://kexue.fm/archives/7115),属于自回归文本生成。 +- [task_relation_extraction.py](https://github.com/bojone/bert4keras/tree/master/examples/task_relation_extraction.py): 
任务例子,结合BERT以及自行设计的“半指针-半标注”结构来做[关系抽取](https://kexue.fm/archives/7161)。 +- [task_sentence_similarity_lcqmc.py](https://github.com/bojone/bert4keras/tree/master/examples/task_sentence_similarity_lcqmc.py): 任务例子,句子对分类任务。 +- [task_sentiment_albert.py](https://github.com/bojone/bert4keras/tree/master/examples/task_sentiment_albert.py): 任务例子,情感分类任务,加载ALBERT模型。 +- [task_sentiment_integrated_gradients.py](https://github.com/bojone/bert4keras/tree/master/examples/task_sentiment_integrated_gradients.py): 任务例子,通过[积分梯度](https://kexue.fm/archives/7533)的方式可视化情感分类任务。 +- [task_sentiment_virtual_adversarial_training.py](https://github.com/bojone/bert4keras/tree/master/examples/task_sentiment_virtual_adversarial_training.py): 任务例子,通过[虚拟对抗训练](https://kexue.fm/archives/7466)进行半监督学习,提升小样本下的情感分类性能。 +- [task_seq2seq_ape210k_math_word_problem.py](https://github.com/bojone/bert4keras/tree/master/examples/task_seq2seq_ape210k_math_word_problem.py): 任务例子,通过[UniLM](https://kexue.fm/archives/6933)式的Seq2Seq模型来做小学数学应用题(数学公式生成),详情请见[这里](https://kexue.fm/archives/7809)。 +- [task_seq2seq_autotitle.py](https://github.com/bojone/bert4keras/tree/master/examples/task_seq2seq_autotitle.py): 任务例子,通过[UniLM](https://kexue.fm/archives/6933)式的Seq2Seq模型来做新闻标题生成。 +- [task_seq2seq_autotitle_csl.py](https://github.com/bojone/bert4keras/tree/master/examples/task_seq2seq_autotitle_csl.py): 任务例子,通过[UniLM](https://kexue.fm/archives/6933)式的Seq2Seq模型来做论文标题生成,包含了评测代码。 +- [task_seq2seq_autotitle_csl_mt5.py](https://github.com/bojone/bert4keras/tree/master/examples/task_seq2seq_autotitle_csl_mt5.py): 任务例子,通过[多国语言版T5](https://kexue.fm/archives/7867)式的Seq2Seq模型来做论文标题生成,包含了评测代码。 +- [task_seq2seq_autotitle_multigpu.py](https://github.com/bojone/bert4keras/tree/master/examples/task_seq2seq_autotitle_multigpu.py): 任务例子,通过[UniLM](https://kexue.fm/archives/6933)式的Seq2Seq模型来做新闻标题生成,单机多卡版本。 +- [task_sequence_labeling_cws_crf.py](https://github.com/bojone/bert4keras/tree/master/examples/task_sequence_labeling_cws_crf.py): 任务例子,通过 BERT + [CRF](https://kexue.fm/archives/7196) 来做中文分词。 +- [task_sequence_labeling_ner_crf.py](https://github.com/bojone/bert4keras/tree/master/examples/task_sequence_labeling_ner_crf.py): +任务例子,通过 BERT + [CRF](https://kexue.fm/archives/7196) 来做中文NER。 diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/__init__.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_extract_features.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_extract_features.py new file mode 100644 index 0000000000000000000000000000000000000000..ede248f48e01388fffac3651a4ec6987cbaaf374 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_extract_features.py @@ -0,0 +1,72 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! -*- coding: utf-8 -*- +# 测试代码可用性: 提取特征 + +import numpy as np +from bert4keras.backend import keras +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import Tokenizer +from bert4keras.snippets import to_array + +config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' + +tokenizer = Tokenizer(dict_path, do_lower_case=True) # 建立分词器 +model = build_transformer_model(config_path, checkpoint_path) # 建立模型,加载权重 + +# 编码测试 +token_ids, segment_ids = tokenizer.encode(u'语言模型') +token_ids, segment_ids = to_array([token_ids], [segment_ids]) + +print('\n ===== predicting =====\n') +print(model.predict([token_ids, segment_ids])) +""" +输出: +[[[-0.63251007 0.2030236 0.07936534 ... 0.49122632 -0.20493352 + 0.2575253 ] + [-0.7588351 0.09651865 1.0718756 ... -0.6109694 0.04312154 + 0.03881441] + [ 0.5477043 -0.792117 0.44435206 ... 0.42449304 0.41105673 + 0.08222899] + [-0.2924238 0.6052722 0.49968526 ... 0.8604137 -0.6533166 + 0.5369075 ] + [-0.7473459 0.49431565 0.7185162 ... 0.3848612 -0.74090636 + 0.39056838] + [-0.8741375 -0.21650358 1.338839 ... 0.5816864 -0.4373226 + 0.56181806]]] +""" + +print('\n ===== reloading and predicting =====\n') +model.save('test.model') +del model +model = keras.models.load_model('test.model') +print(model.predict([token_ids, segment_ids])) diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_gibbs_sampling_via_mlm.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_gibbs_sampling_via_mlm.py new file mode 100644 index 0000000000000000000000000000000000000000..741ea4f05a7a15b572b499f5e6dbcc251b3efab8 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_gibbs_sampling_via_mlm.py @@ -0,0 +1,75 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! -*- coding: utf-8 -*- +# 测试代码可用性: 结合MLM的Gibbs采样 + +from tqdm import tqdm +import numpy as np +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import Tokenizer +from bert4keras.snippets import to_array + +config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' + +tokenizer = Tokenizer(dict_path, do_lower_case=True) # 建立分词器 +model = build_transformer_model( + config_path=config_path, checkpoint_path=checkpoint_path, with_mlm=True +) # 建立模型,加载权重 + +sentences = [] +init_sent = u'科学技术是第一生产力。' # 给定句子或者None +minlen, maxlen = 8, 32 +steps = 10000 +converged_steps = 1000 +vocab_size = tokenizer._vocab_size + +if init_sent is None: + length = np.random.randint(minlen, maxlen + 1) + tokens = ['[CLS]'] + ['[MASK]'] * length + ['[SEP]'] + token_ids = tokenizer.tokens_to_ids(tokens) + segment_ids = [0] * len(token_ids) +else: + token_ids, segment_ids = tokenizer.encode(init_sent) + length = len(token_ids) - 2 + +for _ in tqdm(range(steps), desc='Sampling'): + # Gibbs采样流程:随机mask掉一个token,然后通过MLM模型重新采样这个token。 + i = np.random.choice(length) + 1 + token_ids[i] = tokenizer._token_mask_id + probas = model.predict(to_array([token_ids], [segment_ids]))[0, i] + token = np.random.choice(vocab_size, p=probas) + token_ids[i] = token + sentences.append(tokenizer.decode(token_ids)) + +print(u'部分随机采样结果:') +for _ in range(10): + print(np.random.choice(sentences[converged_steps:])) diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_language_model_cpm_lm.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_language_model_cpm_lm.py new file mode 100644 index 0000000000000000000000000000000000000000..9429a1e3d6d05573dea7d78e139f32656ea05778 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_language_model_cpm_lm.py @@ -0,0 +1,150 @@ + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! -*- coding: utf-8 -*- +# 基本测试:清华开源的中文GPT2模型(26亿参数) +# 项目链接:https://github.com/TsinghuaAI/CPM-Generate +# 博客介绍:https://kexue.fm/archives/7912 + +import numpy as np +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import SpTokenizer +from bert4keras.snippets import AutoRegressiveDecoder +from bert4keras.snippets import uniout +import jieba +jieba.initialize() + +# 模型路径 +config_path = '/root/kg/bert/CPM_LM_2.6B_TF/config.json' +checkpoint_path = '/root/kg/bert/CPM_LM_2.6B_TF/model.ckpt' +spm_path = '/root/kg/bert/CPM_LM_2.6B_TF/chinese_vocab.model' + + +def pre_tokenize(text): + """分词前处理函数 + """ + return [ + w.replace(' ', u'\u2582').replace('\n', u'\u2583') + for w in jieba.cut(text, cut_all=False) + ] + + +tokenizer = SpTokenizer( + spm_path, + token_start=None, + token_end=None, + pre_tokenize=pre_tokenize, + token_translate={u'\u2583': ''} +) # 建立分词器 + +model = build_transformer_model( + config_path=config_path, checkpoint_path=checkpoint_path, model='gpt2' +) # 建立模型,加载权重 + + +class TextExpansion(AutoRegressiveDecoder): + """基于随机采样的文本续写 + """ + @AutoRegressiveDecoder.wraps(default_rtype='probas') + def predict(self, inputs, output_ids, states): + token_ids = np.concatenate([inputs[0], output_ids], 1) + return self.last_token(model).predict(token_ids) + + def generate(self, text, n=1, topp=0.95, temperature=1): + """输出结果会有一定的随机性,如果只关心Few Shot效果, + 可以考虑将解码方式换为beam search。 + """ + token_ids, _ = tokenizer.encode(text) + results = self.random_sample([token_ids], + n, + topp=topp, + temperature=temperature) # 基于随机采样 + results = [token_ids + [int(i) for i in ids] for ids in results] + texts = [tokenizer.decode(ids) for ids in results] + return [self.post_replace(text) for text in texts] + + def post_replace(self, text): + for s, t in [(' ', ''), (u'\u2582', ' '), (u'\u2583', '\n')]: + text = text.replace(s, t) + return text + + +text_expansion = TextExpansion( + start_id=None, + end_id=3, # 3是,也是换行符 + maxlen=16, +) + +# 常识推理 +# 本例输出:北京 +query = u""" +美国的首都是华盛顿 +法国的首都是巴黎 +日本的首都是东京 +中国的首都是 +""" +print(text_expansion.generate(query[1:-1], 1)[0]) + +# 单词翻译 +# 本例输出:bird +query = u""" +狗 dog +猫 cat +猪 pig +鸟 +""" +print(text_expansion.generate(query[1:-1], 1)[0]) + +# 主语抽取 +# 本例输出:杨振宁 +query = u""" +从1931年起,华罗庚在清华大学边学习边工作 华罗庚 +在一间简陋的房间里,陈景润攻克了“哥德巴赫猜想” 陈景润 +在这里,丘成桐得到IBM奖学金 丘成桐 +杨振宁在粒子物理学、统计力学和凝聚态物理等领域作出里程碑性贡献 +""" +print(text_expansion.generate(query[1:-1], 1)[0]) + +# 三元组抽取 +# 本例输出:张红,体重,140斤 +query = u""" +姚明的身高是211cm,是很多人心目中的偶像。 ->姚明,身高,211cm +毛泽东是绍兴人,早年在长沙读书。->毛泽东,出生地,绍兴 +虽然周杰伦在欧洲办的婚礼,但是他是土生土长的中国人->周杰伦,国籍,中国 +小明出生于武汉,但是却不喜欢在武汉生成,长大后去了北京。->小明,出生地,武汉 +吴亦凡是很多人的偶像,但是他却是加拿大人,另很多人失望->吴亦凡,国籍,加拿大 +武耀的生日在5月8号,这一天,大家都为他庆祝了生日->武耀,生日,5月8号 +《青花瓷》是周杰伦最得意的一首歌。->周杰伦,作品,《青花瓷》 +北京是中国的首都。->中国,首都,北京 +蒋碧的家乡在盘龙城,毕业后去了深圳工作。->蒋碧,籍贯,盘龙城 +上周我们和王立一起去了他的家乡云南玩昨天才回到了武汉。->王立,籍贯,云南 
+昨天11月17号,我和朋友一起去了海底捞,期间服务员为我的朋友刘章庆祝了生日。->刘章,生日,11月17号 +张红的体重达到了140斤,她很苦恼。-> +""" +print(text_expansion.generate(query[1:-1], 1)[0]) diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_language_model_gpt2_ml.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_language_model_gpt2_ml.py new file mode 100644 index 0000000000000000000000000000000000000000..e239d78fb1ef438954dc111dd96910bf82751f1a --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_language_model_gpt2_ml.py @@ -0,0 +1,115 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding: utf-8 -*- +# 基本测试:中文GPT2_ML模型 +# 介绍链接:https://kexue.fm/archives/7292 + +import numpy as np +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import Tokenizer +from bert4keras.snippets import AutoRegressiveDecoder +from bert4keras.snippets import uniout + +config_path = '/root/kg/bert/gpt2_ml/config.json' +checkpoint_path = '/root/kg/bert/gpt2_ml/model.ckpt-100000' +dict_path = '/root/kg/bert/gpt2_ml/vocab.txt' + +tokenizer = Tokenizer( + dict_path, token_start=None, token_end=None, do_lower_case=True +) # 建立分词器 + +model = build_transformer_model( + config_path=config_path, checkpoint_path=checkpoint_path, model='gpt2_ml' +) # 建立模型,加载权重 + + +class ArticleCompletion(AutoRegressiveDecoder): + """基于随机采样的文章续写 + """ + @AutoRegressiveDecoder.wraps(default_rtype='probas') + def predict(self, inputs, output_ids, states): + token_ids = np.concatenate([inputs[0], output_ids], 1) + return self.last_token(model).predict(token_ids) + + def generate(self, text, n=1, topp=0.95): + token_ids, _ = tokenizer.encode(text) + results = self.random_sample([token_ids], n, topp=topp) # 基于随机采样 + return [text + tokenizer.decode(ids) for ids in results] + + +article_completion = ArticleCompletion( + start_id=None, + end_id=511, # 511是中文句号 + maxlen=256, + minlen=128 +) + +print(article_completion.generate(u'今天天气不错')) +""" +部分结果: + +>>> article_completion.generate(u'今天天气不错') +[u'今天天气不错,可以去跑步。昨晚看了一个关于跑步的纪录片,里面的女主讲述的是一个女孩子的成长,很励志,也很美丽。我也想跑,但是我不知道跑步要穿运动鞋,所以就买了一双运动鞋。这个纪录片是关于运动鞋的,有一 集讲了一个女孩子,从小学开始就没有穿过运动鞋,到了高中才开始尝试跑步。'] + +>>> article_completion.generate(u'双十一') +[u'双十一马上就要到了!你还在为双11的物流配送而担心吗?你还在为没时间去仓库取货而发愁吗?你还在为不知道怎么买到便宜货而发愁吗?你还在为买不到心仪的产品而懊恼吗?那么,双十一就来了!今天小编带你来看看这些 快递,都是怎么送货的!1. 物流配送快递公司的配送,主要是由快递公司负责,快递公司负责派件,物流服务。'] + +>>> article_completion.generate(u'科学空间') +[u'科学空间站科学空间站(英文:science space station),是中华人民共和国的一个空间站。该空间站是中国科学院大连物理研究所研制,主要研发和使用中国科学院大连物理研究所的核能动力空间站。科学空间站位于北京市海淀区,距离地面393米,总建筑面积约为1万平方米,总投资约为5亿元人民币。科学空间站于2018年12月26日开始动工,2021年6月建成并投入使用。'] +""" diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_language_model_nezha_gen_gpt.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_language_model_nezha_gen_gpt.py new file mode 100644 index 0000000000000000000000000000000000000000..667eb69fec971d584725dcc11115316ec643dc5c --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_language_model_nezha_gen_gpt.py @@ -0,0 +1,116 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! -*- coding: utf-8 -*- +# 基本测试:中文GPT模型,base版本,华为开源的 +# 权重链接: https://pan.baidu.com/s/1-FB0yl1uxYDCGIRvU1XNzQ 提取码: xynn +# 参考项目:https://github.com/bojone/chinese-gen + +import numpy as np +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import Tokenizer +from bert4keras.snippets import AutoRegressiveDecoder +from bert4keras.snippets import uniout + +config_path = '/root/kg/bert/chinese_nezha_gpt_L-12_H-768_A-12/config.json' +checkpoint_path = '/root/kg/bert/chinese_nezha_gpt_L-12_H-768_A-12/gpt.ckpt' +dict_path = '/root/kg/bert/chinese_nezha_gpt_L-12_H-768_A-12/vocab.txt' + +tokenizer = Tokenizer(dict_path, do_lower_case=True) # 建立分词器 + +model = build_transformer_model( + config_path=config_path, + checkpoint_path=checkpoint_path, + segment_vocab_size=0, # 去掉segment_ids输入 + application='lm', +) # 建立模型,加载权重 + + +class ArticleCompletion(AutoRegressiveDecoder): + """基于随机采样的文章续写 + """ + @AutoRegressiveDecoder.wraps(default_rtype='probas') + def predict(self, inputs, output_ids, states): + token_ids = np.concatenate([inputs[0], output_ids], 1) + return self.last_token(model).predict(token_ids) + + def generate(self, text, n=1, topp=0.95): + token_ids = tokenizer.encode(text)[0][:-1] + results = self.random_sample([token_ids], n, topp=topp) # 基于随机采样 + return [text + tokenizer.decode(ids) for ids in results] + + +article_completion = ArticleCompletion( + start_id=None, + end_id=511, # 511是中文句号 + maxlen=256, + minlen=128 +) + +print(article_completion.generate(u'今天天气不错')) +""" +部分结果: +>>> article_completion.generate(u'今天天气不错') +[u'今天天气不错。昨天的天气是多云到晴的天气,今天的天气还不错,不会太冷。明后两天天气还是比较好的。不过今天的天气比较闷热,最高温度在30℃左右,明后两天天气会更加热。预计今天的最高温度为30℃,明后两天的最 高温度为32℃左右,今天的最高气温将在30℃左右。(记者李莉)。新华网重庆频道诚邀广大网友投稿,您可以用相机或手机记录下身边的感人故事,精彩瞬间。请将作者、拍摄时间、地点和简要说明连同照片发给我们,我们将精选其中的好图、美图在页面上展示,让所有新华网友共赏。[投稿] 
。本报讯(记者陈敏华) 今年上半年,重庆市各级公安机关在全力抓好'] + +>>> article_completion.generate(u'双十一') +[u'双十一大是中国共产党在新的历史起点上召开的一次十分重要的代表大会, 是全面落实科学发展观、推进中国特色社会主义伟大事业的一次重要会议。会议的召开, 是党和政府对新世纪新阶段我国改革开放和社会主义现代化建设 事业的新的历史任务的一次重要总动员, 必将对我们党全面推进党的建'] + +>>> article_completion.generate(u'科学空间') +[u'科学空间站上的两个机器人在进入轨道后,一边在轨道上工作,一边用它们的身体和心脏在空间站上的一个大气层进行活动,以确保它们在进入地球之后不会因太阳风暴而受到影响;而另外一个机器人则在进入轨道的过程中,通 过机器人与地球上的大气层相互作用,使地球的大气层不断地向地球的大气层中转移,以使其能够在空间站上工作,并且使用它们的身体和心脏来完成它们的各种任务。'] +""" diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_make_uncased_model_cased.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_make_uncased_model_cased.py new file mode 100644 index 0000000000000000000000000000000000000000..462d538008e111c38007d5e7fc205717650c968e --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_make_uncased_model_cased.py @@ -0,0 +1,93 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! -*- coding: utf-8 -*- +# 通过简单修改词表,使得不区分大小写的模型有区分大小写的能力 +# 基本思路:将英文单词大写化后添加到词表中,并修改模型Embedding层 + +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import Tokenizer, load_vocab +from bert4keras.snippets import to_array +import numpy as np + +config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' + +token_dict = load_vocab(dict_path) +new_token_dict = token_dict.copy() +compound_tokens = [] + +for t, i in sorted(token_dict.items(), key=lambda s: s[1]): + # 这里主要考虑两种情况:1、首字母大写;2、整个单词大写。 + # Python2下,新增了5594个token;Python3下,新增了5596个token。 + tokens = [] + if t.isalpha(): + tokens.extend([t[:1].upper() + t[1:], t.upper()]) + elif t[:2] == '##' and t[2:].isalpha(): + tokens.append(t.upper()) + for token in tokens: + if token not in new_token_dict: + compound_tokens.append([i]) + new_token_dict[token] = len(new_token_dict) + +tokenizer = Tokenizer(new_token_dict, do_lower_case=False) + +model = build_transformer_model( + config_path, + checkpoint_path, + compound_tokens=compound_tokens, # 增加新token,用旧token平均来初始化 +) + +text = u'Welcome to BEIJING.' 
+tokens = tokenizer.tokenize(text) +print(tokens) +""" +输出:['[CLS]', u'Welcome', u'to', u'BE', u'##I', u'##JING', u'.', '[SEP]'] +""" + +token_ids, segment_ids = tokenizer.encode(text) +token_ids, segment_ids = to_array([token_ids], [segment_ids]) +print(model.predict([token_ids, segment_ids])) +""" +输出: +[[[-1.4999904e-01 1.9651388e-01 -1.7924258e-01 ... 7.8269649e-01 + 2.2241375e-01 1.1325148e-01] + [-4.5268752e-02 5.5090344e-01 7.4699545e-01 ... -4.7773960e-01 + -1.7562288e-01 4.1265407e-01] + [ 7.0158571e-02 1.7816302e-01 3.6949167e-01 ... 9.6258509e-01 + -8.4678203e-01 6.3776302e-01] + ... + [ 9.3637377e-01 3.0232478e-02 8.1411439e-01 ... 7.9186147e-01 + 7.5704646e-01 -8.3475001e-04] + [ 2.3699696e-01 2.9953337e-01 8.1962071e-02 ... -1.3776925e-01 + 3.8681498e-01 3.2553676e-01] + [ 1.9728680e-01 7.7782705e-02 5.2951699e-01 ... 8.9622810e-02 + -2.3932748e-02 6.9600858e-02]]] +""" diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_masked_language_model.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_masked_language_model.py new file mode 100644 index 0000000000000000000000000000000000000000..12a73b5a1526180679c30d0a67da6b8362e4f806 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_masked_language_model.py @@ -0,0 +1,55 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding: utf-8 -*- +# 测试代码可用性: MLM + +import numpy as np +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import Tokenizer +from bert4keras.snippets import to_array + +config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' + +tokenizer = Tokenizer(dict_path, do_lower_case=True) # 建立分词器 +model = build_transformer_model( + config_path=config_path, checkpoint_path=checkpoint_path, with_mlm=True +) # 建立模型,加载权重 + +token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力') + +# mask掉“技术” +token_ids[3] = token_ids[4] = tokenizer._token_mask_id +token_ids, segment_ids = to_array([token_ids], [segment_ids]) + +# 用mlm模型预测被mask掉的部分 +probas = model.predict([token_ids, segment_ids])[0] +print(tokenizer.decode(probas[3:5].argmax(axis=1))) # 结果正是“技术” diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_simple_web_serving_simbert.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_simple_web_serving_simbert.py new file mode 100644 index 0000000000000000000000000000000000000000..a9a401121f12075a87a8d14238669374d6f867ac --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/basic_simple_web_serving_simbert.py @@ -0,0 +1,115 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding: utf-8 -*- +# 利用自带的接口,将SimBERT的同义句生成搭建成Web服务。 +# 基于bottlepy简单封装,仅作为临时测试使用,不保证性能。 +# 目前仅保证支持 Tensorflow 1.x + Keras <= 2.3.1。 +# 具体用法请看 https://github.com/bojone/bert4keras/blob/8ffb46a16a79f87aa8cdf045df7994036b4be47d/bert4keras/snippets.py#L580 + +import numpy as np +from collections import Counter +from bert4keras.backend import keras, K +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import Tokenizer +from bert4keras.snippets import sequence_padding, AutoRegressiveDecoder +from bert4keras.snippets import WebServing + +maxlen = 32 + +# bert配置 +config_path = '/root/kg/bert/chinese_simbert_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_simbert_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_simbert_L-12_H-768_A-12/vocab.txt' + +# 建立分词器 +tokenizer = Tokenizer(dict_path, do_lower_case=True) # 建立分词器 + +# 建立加载模型 +bert = build_transformer_model( + config_path, + checkpoint_path, + with_pool='linear', + application='unilm', + return_keras_model=False, +) + +encoder = keras.models.Model(bert.model.inputs, bert.model.outputs[0]) +seq2seq = keras.models.Model(bert.model.inputs, bert.model.outputs[1]) + + +class SynonymsGenerator(AutoRegressiveDecoder): + """seq2seq解码器 + """ + @AutoRegressiveDecoder.wraps(default_rtype='probas') + def predict(self, inputs, output_ids, states): + token_ids, segment_ids = inputs + token_ids = np.concatenate([token_ids, output_ids], 1) + segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1) + return self.last_token(seq2seq).predict([token_ids, segment_ids]) + + def generate(self, text, n=1, topp=0.95): + token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) + output_ids = self.random_sample([token_ids, segment_ids], n, + topp=topp) # 基于随机采样 + return [tokenizer.decode(ids) for ids in output_ids] + + +synonyms_generator = SynonymsGenerator( + start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen +) + + +def gen_synonyms(text, n=100, k=20): + """"含义: 产生sent的n个相似句,然后返回最相似的k个。 + 做法:用seq2seq生成,并用encoder算相似度并排序。 + """ + r = synonyms_generator.generate(text, n) + r = [i for i in set(r) if i != text] + r = [text] + r + X, S = [], [] + for t in r: + x, s = tokenizer.encode(t) + X.append(x) + S.append(s) + X = sequence_padding(X) + S = sequence_padding(S) + Z = encoder.predict([X, S]) + Z /= (Z**2).sum(axis=1, keepdims=True)**0.5 + argsort = np.dot(Z[1:], -Z[0]).argsort() + return [r[i + 1] for i in argsort[:k]] + + +if __name__ == '__main__': + + arguments = {'text': (None, True), 'n': (int, False), 'k': (int, False)} + web = WebServing(port=8864) + web.route('/gen_synonyms', gen_synonyms, arguments) + web.start() + # 现在可以测试访问 http://127.0.0.1:8864/gen_synonyms?text=苹果多少钱一斤 diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/__init__.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c6a79e948b5a26a44615f79cd4fcdcf2e0a6f4c6 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/__init__.py @@ -0,0 +1,30 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
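The core of `gen_synonyms` above is the ranking step: L2-normalise the encoder vectors and sort candidates by cosine similarity to the query. A self-contained numpy sketch of just that step (illustrative only; random vectors stand in for `encoder.predict` output):

```python
import numpy as np

def rank_by_cosine(vectors, k=20):
    """vectors[0] is the query sentence vector, the rest are candidates;
    returns indices (into `vectors`) and scores of the k nearest candidates."""
    Z = np.asarray(vectors, dtype=np.float64)
    Z = Z / (Z ** 2).sum(axis=1, keepdims=True) ** 0.5   # L2-normalise rows
    scores = Z[1:] @ Z[0]                                # cosine similarities
    order = np.argsort(-scores)                          # best first
    return (order + 1)[:k], scores[order][:k]

idx, sims = rank_by_cosine(np.random.randn(6, 8), k=3)
print(idx, sims)
```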
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +__version__ = '0.11.3' diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/backend.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/backend.py new file mode 100644 index 0000000000000000000000000000000000000000..6a21c8906bd834e4cde33b97e248ea99a3abb4e7 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/backend.py @@ -0,0 +1,536 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# -*- coding: utf-8 -*- +# 分离后端函数,主要是为了同时兼容原生keras和tf.keras +# 通过设置环境变量TF_KERAS=1来切换tf.keras + +import os, sys +from distutils.util import strtobool +import numpy as np +import tensorflow as tf +from tensorflow.python.client import device_lib +from tensorflow.python.util import nest, tf_inspect +from tensorflow.python.eager import tape +from tensorflow.python.ops.custom_gradient import _graph_mode_decorator + +# 判断是tf.keras还是纯keras的标记 +is_tf_keras = strtobool(os.environ.get('TF_KERAS', '0')) + +if is_tf_keras: + sys.modules['keras'] = tf.keras + +import keras +import keras.backend as K + +# 判断是否启用重计算(通过时间换空间) +do_recompute = strtobool(os.environ.get('RECOMPUTE', '0')) + + +def get_available_gpus(): + """获取可用的GPU列表 + """ + devices = device_lib.list_local_devices() + devices = [x.name for x in devices if x.device_type == 'GPU'] + return devices + + +def gelu_erf(x): + """基于Erf直接计算的gelu函数 + """ + return 0.5 * x * (1.0 + tf.math.erf(x / np.sqrt(2.0))) + + +def gelu_tanh(x): + """基于Tanh近似计算的gelu函数 + """ + cdf = 0.5 * ( + 1.0 + K.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * K.pow(x, 3)))) + ) + return x * cdf + + +def set_gelu(version): + """设置gelu版本 + """ + version = version.lower() + assert version in ['erf', 'tanh'], 'gelu version must be erf or tanh' + if version == 'erf': + keras.utils.get_custom_objects()['gelu'] = gelu_erf + else: + keras.utils.get_custom_objects()['gelu'] = gelu_tanh + + +def infinity(): + """返回默认的代表无穷大的数值 + """ + return keras.utils.get_custom_objects().get('infinity', 1e12) + + +def set_infinity(value): + """设置新的代表无穷大的数值 + """ + keras.utils.get_custom_objects()['infinity'] = value + + +def piecewise_linear(t, schedule, from_zero=True): + """分段线性函数 + 其中schedule是形如{1000: 1, 2000: 0.1}的字典, + 表示 t ∈ [0, 1000]时,输出从0均匀增加至1,而 + t ∈ [1000, 2000]时,输出从1均匀降低到0.1,最后 + t > 2000时,保持0.1不变。 + """ + schedule = sorted(schedule.items()) + if from_zero and schedule[0][0] != 0: + schedule = [(0, 0.0)] + schedule + + t = K.cast(t, K.floatx()) + x = (t * 0 + 1) * schedule[0][1] + for i in range(len(schedule)): + t_begin = schedule[i][0] + x_begin = x + if i != len(schedule) - 1: + dx = schedule[i + 1][1] - schedule[i][1] + dt = schedule[i + 1][0] - schedule[i][0] + slope = 1.0 * dx / dt + x = schedule[i][1] + slope * (t - t_begin) + else: + x = (t * 0 + 1) * schedule[i][1] + x = K.switch(t >= t_begin, x, x_begin) + + return x + + +def search_layer(inputs, name, exclude_from=None): + """根据inputs和name来搜索层 + 说明:inputs为某个层或某个层的输出;name为目标层的名字。 + 实现:根据inputs一直往上递归搜索,直到发现名字为name的层为止; + 如果找不到,那就返回None。 + """ + if exclude_from is None: + exclude_from = set() + + if isinstance(inputs, keras.layers.Layer): + layer = inputs + else: + layer = inputs._keras_history[0] + + if layer.name == name: + return layer + elif layer in exclude_from: + return None + else: + exclude_from.add(layer) + if isinstance(layer, keras.models.Model): + model = layer + for layer in model.layers: + if layer.name == name: + return layer + inbound_layers = layer._inbound_nodes[0].inbound_layers + if not isinstance(inbound_layers, list): + inbound_layers = [inbound_layers] + if len(inbound_layers) > 0: + for layer in inbound_layers: + layer = search_layer(layer, name, exclude_from) + if layer is not None: + return layer + + +def align(tensor, axes, ndim=None): + """重新对齐tensor(批量版expand_dims) + axes:原来的第i维对齐新tensor的第axes[i]维; + ndim:新tensor的维度。 + """ + assert len(axes) == K.ndim(tensor) + assert ndim or min(axes) >= 0 + ndim = ndim or max(axes) + 1 + indices = [None] * ndim + for i in axes: + indices[i] = slice(None) + return 
tensor[indices] + + +def reshape(tensor, *args): + """实现更灵活的reshape + 其中 *args 为 (shape1, axis1, shape2, axis2, ...) 格式,表示将 + 维度axis1转换为shape1、维度axis2转换为shape2、... + """ + if len(args) == 1: + return tf.reshape(tensor, args[0]) + assert len(args) % 2 == 0 + shape = K.shape(tensor) + shape = [[s or shape[i]] for i, s in enumerate(K.int_shape(tensor))] + for s, i in zip(args[::2], args[1::2]): + s = list(s) + assert s.count(-1) <= 1 + if s.count(-1) == 1: + j = s.index(-1) + s[j] = -shape[i][0] // K.prod(s) + shape[i] = s + return tf.reshape(tensor, [i for s in shape for i in s]) + + +def flatten(tensor, start=None, end=None): + """将tensor从start到end的维度展平 + """ + start, end = start or 0, end or K.ndim(tensor) + shape = K.shape(tensor) + shape = [s or shape[i] for i, s in enumerate(K.int_shape(tensor))] + shape = shape[:start] + [K.prod(shape[start:end])] + shape[end:] + return K.reshape(tensor, shape) + + +def sequence_masking(x, mask, value=0, axis=None): + """为序列条件mask的函数 + mask: 形如(batch_size, seq_len)的0-1矩阵; + value: mask部分要被替换成的值,可以是'-inf'或'inf'; + axis: 序列所在轴,默认为1; + """ + if mask is None: + return x + else: + x_dtype = K.dtype(x) + if x_dtype == 'bool': + x = K.cast(x, 'int32') + if K.dtype(mask) != K.dtype(x): + mask = K.cast(mask, K.dtype(x)) + if value == '-inf': + value = -K.infinity() + elif value == 'inf': + value = K.infinity() + if axis is None: + axis = 1 + elif axis < 0: + axis = K.ndim(x) + axis + assert axis > 0, 'axis must be greater than 0' + mask = align(mask, [0, axis], K.ndim(x)) + value = K.cast(value, K.dtype(x)) + x = x * mask + value * (1 - mask) + if x_dtype == 'bool': + x = K.cast(x, 'bool') + return x + + +def batch_gather(params, indices): + """同tf旧版本的batch_gather + """ + if K.dtype(indices)[:3] != 'int': + indices = K.cast(indices, 'int32') + + try: + return tf.gather(params, indices, batch_dims=K.ndim(indices) - 1) + except Exception as e1: + try: + return tf.batch_gather(params, indices) + except Exception as e2: + raise ValueError('%s\n%s\n' % (e1.message, e2.message)) + + +def pool1d( + x, + pool_size, + strides=1, + padding='valid', + data_format=None, + pool_mode='max' +): + """向量序列的pool函数 + """ + x = K.expand_dims(x, 1) + x = K.pool2d( + x, + pool_size=(1, pool_size), + strides=(1, strides), + padding=padding, + data_format=data_format, + pool_mode=pool_mode + ) + return x[:, 0] + + +def divisible_temporal_padding(x, n): + """将一维向量序列右padding到长度能被n整除 + """ + r_len = K.shape(x)[1] % n + p_len = K.switch(r_len > 0, n - r_len, 0) + return K.temporal_padding(x, (0, p_len)) + + +def root_mean_square(x, axis=None, keepdims=False): + """均方根,相当于模长的变体 + """ + return K.sqrt(K.mean(K.square(x), axis=axis, keepdims=keepdims)) + + +def swish(x): + """swish函数(这样封装过后才有 __name__ 属性) + """ + return tf.nn.swish(x) + + +def leaky_relu(x, alpha=0.2): + """leaky relu函数(这样封装过后才有 __name__ 属性) + """ + return tf.nn.leaky_relu(x, alpha=alpha) + + +def attention_normalize(a, axis=-1, method='softmax'): + """不同的注意力归一化方案 + softmax:常规/标准的指数归一化; + squared_relu:来自 https://arxiv.org/abs/2202.10447 ; + softmax_plus:来自 https://kexue.fm/archives/8823 。 + """ + if method == 'softmax': + return K.softmax(a, axis=axis) + else: + mask = K.cast(a > -K.infinity() / 10, K.floatx()) + l = K.maximum(K.sum(mask, axis=axis, keepdims=True), 1) + if method == 'squared_relu': + return K.relu(a)**2 / l + elif method == 'softmax_plus': + scale = K.log(l) / np.log(512) * mask + 1 - mask + return K.softmax(a * scale, axis=axis) + return a + + +class Sinusoidal(keras.initializers.Initializer): + 
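`sequence_masking` above pushes padded positions to minus infinity so that a following softmax gives them zero weight. A tiny numpy sketch of that masked-softmax effect (illustrative, not the library implementation):

```python
import numpy as np

def masked_softmax(scores, mask, big_neg=1e12):
    """scores, mask: [batch, seq_len]; mask is 1 for real tokens, 0 for padding."""
    scores = scores * mask - big_neg * (1 - mask)          # padding -> -inf
    scores = scores - scores.max(axis=-1, keepdims=True)   # numerical stability
    exp = np.exp(scores) * mask
    return exp / np.maximum(exp.sum(axis=-1, keepdims=True), 1e-12)

s = np.array([[2.0, 1.0, 0.5, 0.0]])
m = np.array([[1.0, 1.0, 0.0, 0.0]])    # last two tokens are padding
print(masked_softmax(s, m))             # padding receives ~0 probability
```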
"""Sin-Cos位置向量初始化器 + 来自:https://arxiv.org/abs/1706.03762 + """ + def __call__(self, shape, dtype=None): + """Sin-Cos形式的位置向量 + """ + vocab_size, depth = shape + embeddings = np.zeros(shape) + for pos in range(vocab_size): + for i in range(depth // 2): + theta = pos / np.power(10000, 2. * i / depth) + embeddings[pos, 2 * i] = np.sin(theta) + embeddings[pos, 2 * i + 1] = np.cos(theta) + return embeddings + + +def apply_rotary_position_embeddings(sinusoidal, *tensors): + """应用RoPE到tensors中 + 其中,sinusoidal.shape=[b, n, d],tensors为tensor的列表,而 + tensor.shape=[b, n, ..., d]。 + """ + assert len(tensors) > 0, 'at least one input tensor' + assert all([ + K.int_shape(tensor) == K.int_shape(tensors[0]) for tensor in tensors[1:] + ]), 'all tensors must have the same shape' + ndim = K.ndim(tensors[0]) + sinusoidal = align(sinusoidal, [0, 1, -1], ndim) + cos_pos = K.repeat_elements(sinusoidal[..., 1::2], 2, -1) + sin_pos = K.repeat_elements(sinusoidal[..., ::2], 2, -1) + outputs = [] + for tensor in tensors: + tensor2 = K.stack([-tensor[..., 1::2], tensor[..., ::2]], ndim) + tensor2 = K.reshape(tensor2, K.shape(tensor)) + outputs.append(tensor * cos_pos + tensor2 * sin_pos) + return outputs[0] if len(outputs) == 1 else outputs + + +def log(x, epsilon=None): + """给log添加epsilon,防止NaN + """ + if epsilon is None: + return tf.math.log(x) + elif epsilon is True: + epsilon = K.epsilon() + return tf.math.log(K.maximum(x, epsilon)) + + +def multilabel_categorical_crossentropy(y_true, y_pred): + """多标签分类的交叉熵 + 说明: + 1. y_true和y_pred的shape一致,y_true的元素是0~1 + 的数,表示当前类是目标类的概率; + 2. 请保证y_pred的值域是全体实数,换言之一般情况下 + y_pred不用加激活函数,尤其是不能加sigmoid或者 + softmax; + 3. 预测阶段则输出y_pred大于0的类; + 4. 详情请看:https://kexue.fm/archives/7359 和 + https://kexue.fm/archives/9064 。 + """ + y_mask = y_pred > -K.infinity() / 10 + n_mask = (y_true < 1 - K.epsilon()) & y_mask + p_mask = (y_true > K.epsilon()) & y_mask + infs = K.zeros_like(y_pred) + K.infinity() + y_neg = K.switch(n_mask, y_pred, -infs) + K.log(1 - y_true, True) + y_pos = K.switch(p_mask, -y_pred, -infs) + K.log(y_true, True) + zeros = K.zeros_like(y_pred[..., :1]) + y_neg = K.concatenate([y_neg, zeros], axis=-1) + y_pos = K.concatenate([y_pos, zeros], axis=-1) + neg_loss = K.logsumexp(y_neg, axis=-1) + pos_loss = K.logsumexp(y_pos, axis=-1) + return neg_loss + pos_loss + + +def sparse_multilabel_categorical_crossentropy(y_true, y_pred, mask_zero=False): + """稀疏版多标签分类的交叉熵 + 说明: + 1. y_true.shape=[..., num_positive], + y_pred.shape=[..., num_classes]; + 2. 请保证y_pred的值域是全体实数,换言之一般情况下 + y_pred不用加激活函数,尤其是不能加sigmoid或者 + softmax; + 3. 预测阶段则输出y_pred大于0的类; + 4. 
详情请看:https://kexue.fm/archives/7359 。 + """ + zeros = K.zeros_like(y_pred[..., :1]) + y_pred = K.concatenate([y_pred, zeros], axis=-1) + if mask_zero: + infs = zeros + K.infinity() + y_pred = K.concatenate([infs, y_pred[..., 1:]], axis=-1) + y_pos_2 = batch_gather(y_pred, y_true) + y_pos_1 = K.concatenate([y_pos_2, zeros], axis=-1) + if mask_zero: + y_pred = K.concatenate([-infs, y_pred[..., 1:]], axis=-1) + y_pos_2 = batch_gather(y_pred, y_true) + pos_loss = K.logsumexp(-y_pos_1, axis=-1) + all_loss = K.logsumexp(y_pred, axis=-1) + aux_loss = K.logsumexp(y_pos_2, axis=-1) - all_loss + aux_loss = K.clip(1 - K.exp(aux_loss), K.epsilon(), 1) + neg_loss = all_loss + K.log(aux_loss) + return pos_loss + neg_loss + + +def symbolic(f): + """恒等装饰器(兼容旧版本keras用) + """ + return f + + +def graph_mode_decorator(f, *args, **kwargs): + """tf 2.1与之前版本的传参方式不一样,这里做个同步 + """ + if tf.__version__ < '2.1': + return _graph_mode_decorator(f, *args, **kwargs) + else: + return _graph_mode_decorator(f, args, kwargs) + + +def recompute_grad(call): + """重计算装饰器(用来装饰Keras层的call函数) + 关于重计算,请参考:https://arxiv.org/abs/1604.06174 + """ + if not do_recompute: + return call + + def inner(self, inputs, **kwargs): + """定义需要求梯度的函数以及重新定义求梯度过程 + (参考自官方自带的tf.recompute_grad函数) + """ + flat_inputs = nest.flatten(inputs) + call_args = tf_inspect.getfullargspec(call).args + for key in ['mask', 'training']: + if key not in call_args and key in kwargs: + del kwargs[key] + + def kernel_call(): + """定义前向计算 + """ + return call(self, inputs, **kwargs) + + def call_and_grad(*inputs): + """定义前向计算和反向计算 + """ + if is_tf_keras: + with tape.stop_recording(): + outputs = kernel_call() + outputs = tf.identity(outputs) + else: + outputs = kernel_call() + + def grad_fn(doutputs, variables=None): + watches = list(inputs) + if variables is not None: + watches += list(variables) + with tf.GradientTape() as t: + t.watch(watches) + with tf.control_dependencies([doutputs]): + outputs = kernel_call() + grads = t.gradient( + outputs, watches, output_gradients=[doutputs] + ) + del t + return grads[:len(inputs)], grads[len(inputs):] + + return outputs, grad_fn + + if is_tf_keras: # 仅在tf >= 2.0下可用 + outputs, grad_fn = call_and_grad(*flat_inputs) + flat_outputs = nest.flatten(outputs) + + def actual_grad_fn(*doutputs): + grads = grad_fn(*doutputs, variables=self.trainable_weights) + return grads[0] + grads[1] + + watches = flat_inputs + self.trainable_weights + watches = [tf.convert_to_tensor(x) for x in watches] + tape.record_operation( + call.__name__, flat_outputs, watches, actual_grad_fn + ) + return outputs + else: # keras + tf >= 1.14 均可用 + return graph_mode_decorator(call_and_grad, *flat_inputs) + + return inner + + +# 给旧版keras新增symbolic(装饰器),以兼容optimizers.py +K.symbolic = getattr(K, 'symbolic', None) or symbolic + +# 给tf.keras补充上logsumexp +K.logsumexp = getattr(K, 'logsumexp', None) or tf.math.reduce_logsumexp + +# 修改版对数函数 +K.log = log + +# 添加到 keras.backend 上,使其可以像 K.epsilon() 那样操作 +K.reshape = reshape +K.flatten = flatten +K.infinity = infinity +K.set_infinity = set_infinity +sys.modules['tensorflow.keras.backend'] = K + +custom_objects = { + 'gelu_erf': gelu_erf, + 'gelu_tanh': gelu_tanh, + 'gelu': gelu_erf, + 'root_mean_square': root_mean_square, + 'swish': swish, + 'leaky_relu': leaky_relu, + 'Sinusoidal': Sinusoidal, + 'multilabel_categorical_crossentropy': multilabel_categorical_crossentropy, + 'initializer': keras.initializers.glorot_uniform, # 就当是默认初始化方案吧 +} + +keras.utils.get_custom_objects().update(custom_objects) diff --git 
a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/layers.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..4f049cf8819e04291a3b2deb564b3dc99c9020d4 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/layers.py @@ -0,0 +1,1778 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! -*- coding: utf-8 -*- +# 自定义层 + +from npu_bridge.npu_init import * +import numpy as np +import tensorflow as tf +from examples.bertkeras.backend import keras, K, is_tf_keras +from examples.bertkeras.backend import align, sequence_masking +from examples.bertkeras.backend import recompute_grad +from examples.bertkeras.backend import attention_normalize +from examples.bertkeras.backend import apply_rotary_position_embeddings +from keras import initializers, activations +from keras.layers import * +from npu_bridge.estimator.npu_aicore_ops import layer_norm + + + +from tensorflow.python.keras import backend +from tensorflow.python.keras.utils import tf_utils +#from tensorflow.python.keras.layers.core import Dropout +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import nn +from npu_bridge.estimator import npu_ops + + +def dropout_call(self, inputs, training=None): + """Make Keras Dropout to execute NPU dropout""" + training=tf.constant(True, dtype=tf.bool) + + if self._get_noise_shape(inputs): + def dropped_inputs(): + return nn.dropout( + inputs, + noise_shape=self._get_noise_shape(inputs), + seed=self.seed, + rate=self.rate) + else: + def dropped_inputs(): + return npu_ops.dropout( + inputs, + noise_shape=self._get_noise_shape(inputs), + seed=self.seed, + keep_prob=1.0 - self.rate) + + output = tf_utils.smart_cond(training, + dropped_inputs, + lambda: array_ops.identity(inputs)) + return output + + +Dropout.call = dropout_call + + + + +def integerize_shape(func): + """装饰器,保证input_shape一定是int或None + """ + def convert(item): + if hasattr(item, '__iter__'): + return [convert(i) for i in item] + elif hasattr(item, 'value'): + return item.value + else: + return item + + def new_func(self, input_shape): + input_shape = convert(input_shape) + return func(self, input_shape) + + return new_func + 
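A small standalone illustration (not part of the original file) of what `integerize_shape` is for: older TF versions report shapes as Dimension objects carrying a `.value` attribute, and the decorator unwraps them so `build()` always receives plain ints or None. It reuses the `integerize_shape` function defined just above; `FakeDimension` is a stand-in for the old `tf.Dimension`.

```python
class FakeDimension:
    """Stand-in for the old tf.Dimension objects."""
    def __init__(self, value):
        self.value = value

class Demo:
    @integerize_shape
    def build(self, input_shape):
        return input_shape

print(Demo().build([FakeDimension(None), FakeDimension(768)]))  # [None, 768]
```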
+ +if (not is_tf_keras) and keras.__version__ < '2.3': + + class Layer(keras.layers.Layer): + """重新定义Layer,赋予“层中层”功能 + (仅keras 2.3以下版本需要) + """ + def __init__(self, **kwargs): + super(Layer, self).__init__(**kwargs) + self.supports_masking = True # 本项目的自定义层均可mask + + def __setattr__(self, name, value): + if isinstance(value, keras.layers.Layer): + if not hasattr(self, '_layers'): + self._layers = [] + if value not in self._layers: + self._layers.append(value) + super(Layer, self).__setattr__(name, value) + + @property + def trainable_weights(self): + trainable = getattr(self, 'trainable', True) + if trainable: + trainable_weights = super(Layer, self).trainable_weights[:] + for l in getattr(self, '_layers', []): + trainable_weights += l.trainable_weights + return trainable_weights + else: + return [] + + @property + def non_trainable_weights(self): + trainable = getattr(self, 'trainable', True) + non_trainable_weights = super(Layer, self).non_trainable_weights[:] + for l in getattr(self, '_layers', []): + if trainable: + non_trainable_weights += l.non_trainable_weights + else: + non_trainable_weights += l.weights + return non_trainable_weights + + if keras.__version__ < '2.2.5': + + import inspect + + class Model(keras.models.Model): + """重新定义Model,整合fit和fit_generator + """ + def fit(self, x=None, *args, **kwargs): + if inspect.isgenerator(x): + return self.fit_generator(x, *args, **kwargs) + else: + return super(Model, self).fit(x, *args, **kwargs) + + keras.models.Model = Model + +else: + + class Layer(keras.layers.Layer): + def __init__(self, **kwargs): + super(Layer, self).__init__(**kwargs) + self.supports_masking = True # 本项目的自定义层均可mask + + +if (not is_tf_keras) or tf.__version__ < '1.15': + + if not is_tf_keras: + NodeBase = keras.engine.base_layer.Node + else: + from tensorflow.python.keras.engine import base_layer + NodeBase = base_layer.Node + + class Node(NodeBase): + """修改Node来修复keras下孪生网络的bug + 注意:这是keras的bug,并不是bert4keras的bug,但keras已经不更新了, + 所以只好在这里进行修改。tf 1.15+自带的keras已经修改了这个 + bug。 + """ + @property + def arguments(self): + return self._arguments.copy() + + @arguments.setter + def arguments(self, value): + self._arguments = value or {} + + if not is_tf_keras: + keras.engine.base_layer.Node = Node + else: + base_layer.Node = Node + + +class GlobalAveragePooling1D(keras.layers.GlobalAveragePooling1D): + """重新定义GlobalAveragePooling1D,支持序列长度为None + """ + def call(self, inputs, mask=None): + axis = 1 if self.data_format == 'channels_last' else 2 + if mask is not None: + mask = K.cast(mask, K.floatx()) + mask = mask[..., None] if axis == 1 else mask[:, None] + return K.sum(inputs * mask, axis=axis) / K.sum(mask, axis=axis) + else: + return K.mean(inputs, axis=axis) + + +class GlobalMaxPooling1D(keras.layers.GlobalMaxPooling1D): + """重新定义GlobalMaxPooling1D,支持mask + """ + def __init__(self, data_format='channels_last', **kwargs): + super(GlobalMaxPooling1D, self).__init__(data_format, **kwargs) + self.supports_masking = True + + def call(self, inputs, mask=None): + axis = 1 if self.data_format == 'channels_last' else 2 + inputs = sequence_masking(inputs, mask, '-inf', axis) + return K.max(inputs, axis=axis) + + def compute_mask(self, inputs, mask=None): + return None + + +# 直接覆盖原对象 +keras.layers.GlobalAveragePooling1D = GlobalAveragePooling1D +keras.layers.GlobalMaxPooling1D = GlobalMaxPooling1D + + +class Embedding(keras.layers.Embedding): + """拓展Embedding层 + """ + def compute_mask(self, inputs, mask=None): + """为了适配T5,保证第一个token不被mask + """ + if K.ndim(inputs) == 2: + mask = 
super(Embedding, self).compute_mask(inputs, mask) + if mask is not None: + mask1 = K.ones_like(mask[:, :1], dtype='bool') + mask2 = mask[:, 1:] + return K.concatenate([mask1, mask2], 1) + else: + return mask + + def call(self, inputs, mode='embedding'): + """新增mode参数,可以为embedding或dense。如果为embedding, + 则等价于普通Embedding层;如果为dense,则等价于无bias的Dense层。 + """ + if mode == 'embedding': + return super(Embedding, self).call(inputs) + else: + kernel = K.transpose(self.embeddings) + return K.dot(inputs, kernel) + + def compute_output_shape(self, input_shape): + """关于判据,本来是通过缓存call时的mode参数来判断的,但是后来发现 + Keras在使用compute_output_shape的时候不一定配套调用了call函数, + 所以缓存的mode可能是不准的,因此只能出此下策。 + """ + if len(input_shape) == 2: + return super(Embedding, self).compute_output_shape(input_shape) + else: + return input_shape[:2] + (K.int_shape(self.embeddings)[0],) + + +class ScaleOffset(Layer): + """简单的仿射变换层(最后一维乘上gamma向量并加上beta向量) + 说明:1、具体操作为最后一维乘上gamma向量并加上beta向量; + 2、如果直接指定scale和offset,那么直接常数缩放和平移; + 3、hidden_*系列参数仅为有条件输入时(conditional=True)使用, + 用于通过外部条件控制beta和gamma。 + """ + def __init__( + self, + scale=True, + offset=True, + conditional=False, + hidden_units=None, + hidden_activation='linear', + hidden_initializer='glorot_uniform', + **kwargs + ): + super(ScaleOffset, self).__init__(**kwargs) + self.scale = scale + self.offset = offset + self.conditional = conditional + self.hidden_units = hidden_units + self.hidden_activation = activations.get(hidden_activation) + self.hidden_initializer = initializers.get(hidden_initializer) + + @integerize_shape + def build(self, input_shape): + super(ScaleOffset, self).build(input_shape) + + if self.conditional: + input_shape = input_shape[0] + + if self.offset is True: + self.beta = self.add_weight( + name='beta', shape=(input_shape[-1],), initializer='zeros' + ) + if self.scale is True: + self.gamma = self.add_weight( + name='gamma', shape=(input_shape[-1],), initializer='ones' + ) + + if self.conditional: + + if self.hidden_units is not None: + self.hidden_dense = Dense( + units=self.hidden_units, + activation=self.hidden_activation, + use_bias=False, + kernel_initializer=self.hidden_initializer + ) + + if self.offset is not False and self.offset is not None: + self.beta_dense = Dense( + units=input_shape[-1], + use_bias=False, + kernel_initializer='zeros' + ) + if self.scale is not False and self.scale is not None: + self.gamma_dense = Dense( + units=input_shape[-1], + use_bias=False, + kernel_initializer='zeros' + ) + + def compute_mask(self, inputs, mask=None): + if self.conditional: + return mask if mask is None else mask[0] + else: + return mask + + @recompute_grad + def call(self, inputs): + """如果带有条件,则默认以list为输入,第二个是条件 + """ + if self.conditional: + inputs, conds = inputs + if self.hidden_units is not None: + conds = self.hidden_dense(conds) + conds = align(conds, [0, -1], K.ndim(inputs)) + + if self.scale is not False and self.scale is not None: + gamma = self.gamma if self.scale is True else self.scale + if self.conditional: + gamma = gamma + self.gamma_dense(conds) + inputs = inputs * gamma + + if self.offset is not False and self.offset is not None: + beta = self.beta if self.offset is True else self.offset + if self.conditional: + beta = beta + self.beta_dense(conds) + inputs = inputs + beta + + return inputs + + def compute_output_shape(self, input_shape): + if self.conditional: + return input_shape[0] + else: + return input_shape + + def get_config(self): + config = { + 'scale': self.scale, + 'offset': self.offset, + 'conditional': self.conditional, + 
'hidden_units': self.hidden_units, + 'hidden_activation': activations.serialize(self.hidden_activation), + 'hidden_initializer': + initializers.serialize(self.hidden_initializer), + } + base_config = super(ScaleOffset, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + +#LN +class LNScaleOffset(Layer): + """简单的仿射变换层(最后一维乘上gamma向量并加上beta向量) + 说明:1、具体操作为最后一维乘上gamma向量并加上beta向量; + 2、如果直接指定scale和offset,那么直接常数缩放和平移; + 3、hidden_*系列参数仅为有条件输入时(conditional=True)使用, + 用于通过外部条件控制beta和gamma。 + """ + def __init__( + self, + scale=True, + offset=True, + conditional=False, + hidden_units=None, + hidden_activation='linear', + hidden_initializer='glorot_uniform', + **kwargs + ): + super(LNScaleOffset, self).__init__(**kwargs) + self.scale = scale + self.offset = offset + self.conditional = conditional + self.hidden_units = hidden_units + self.hidden_activation = activations.get(hidden_activation) + self.hidden_initializer = initializers.get(hidden_initializer) + + @integerize_shape + def build(self, input_shape): + super(LNScaleOffset, self).build(input_shape) + + if self.conditional: + input_shape = input_shape[0] + + if self.offset is True: + self.beta = self.add_weight( + name='beta', shape=(input_shape[-1],), initializer='zeros' + ) + if self.scale is True: + self.gamma = self.add_weight( + name='gamma', shape=(input_shape[-1],), initializer='ones' + ) + + if self.conditional: + + if self.hidden_units is not None: + self.hidden_dense = Dense( + units=self.hidden_units, + activation=self.hidden_activation, + use_bias=False, + kernel_initializer=self.hidden_initializer + ) + + if self.offset is not False and self.offset is not None: + self.beta_dense = Dense( + units=input_shape[-1], + use_bias=False, + kernel_initializer='zeros' + ) + if self.scale is not False and self.scale is not None: + self.gamma_dense = Dense( + units=input_shape[-1], + use_bias=False, + kernel_initializer='zeros' + ) + + def compute_mask(self, inputs, mask=None): + if self.conditional: + return mask if mask is None else mask[0] + else: + return mask + + @recompute_grad + def call(self, inputs, epsilon, begin_norm_axis, begin_params_axis): + """如果带有条件,则默认以list为输入,第二个是条件 + """ + if self.conditional: + inputs, conds = inputs + if self.hidden_units is not None: + conds = self.hidden_dense(conds) + conds = align(conds, [0, -1], K.ndim(inputs)) + + if self.scale is not False and self.scale is not None: + gamma = self.gamma if self.scale is True else self.scale + if self.conditional: + gamma = gamma + self.gamma_dense(conds) + # inputs = inputs * gamma + + if self.offset is not False and self.offset is not None: + beta = self.beta if self.offset is True else self.offset + if self.conditional: + beta = beta + self.beta_dense(conds) + # inputs = inputs + beta + + return layer_norm(inputs, gamma, beta, epsilon = epsilon, + begin_norm_axis=begin_norm_axis, + begin_params_axis=begin_params_axis)[0] + + def compute_output_shape(self, input_shape): + if self.conditional: + return input_shape[0] + else: + return input_shape + + def get_config(self): + config = { + 'scale': self.scale, + 'offset': self.offset, + 'conditional': self.conditional, + 'hidden_units': self.hidden_units, + 'hidden_activation': activations.serialize(self.hidden_activation), + 'hidden_initializer': + initializers.serialize(self.hidden_initializer), + } + base_config = super(LNScaleOffset, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + + + +class Concatenate1D(Layer): + """1维序列拼接层 + 
说明:本来该功能可以直接通过Concatenate层来实现,无奈Keras + 自带的Concatenate层的compute_mask写得不合理,导致一个 + 带mask的序列与一个不带mask的序列拼接会报错,因此干脆 + 自己重写一个好了。 + """ + def call(self, inputs): + return K.concatenate(inputs, axis=1) + + def compute_mask(self, inputs, mask=None): + if mask is not None: + masks = [] + for i, m in enumerate(mask): + if m is None: + m = K.ones_like(inputs[i][..., 0], dtype='bool') + masks.append(m) + return K.concatenate(masks, axis=1) + + def compute_output_shape(self, input_shape): + if all([shape[1] for shape in input_shape]): + seq_len = sum([shape[1] for shape in input_shape]) + return (input_shape[0][0], seq_len, input_shape[0][2]) + else: + return (input_shape[0][0], None, input_shape[0][2]) + + +class BatchSplit(Layer): + """将第一维进行分割 + 主要是用于自行实现多卡数据并行。 + """ + def __init__(self, parts, **kwargs): + super(BatchSplit, self).__init__(**kwargs) + self.parts = parts + + def compute_mask(self, inputs, mask=None): + if isinstance(mask, list): + return [o for i in mask for o in self.compute_mask(inputs, i)] + + if mask is not None: + return self.call(mask) + elif np.ndim(self.parts) > 0: + return [None] * len(self.parts) + else: + return [None] * self.parts + + def call(self, inputs): + if isinstance(inputs, list): + return [o for i in inputs for o in self.call(i)] + + outputs = [] + + batch_size = K.shape(inputs)[0] + if np.ndim(self.parts) > 0: + batch_size = K.cast(batch_size, 'float64') + slices = [ + K.cast(p * batch_size / sum(self.parts), 'int32') + for p in np.cumsum(self.parts).astype('float64') + ] + else: + stride = K.cast( + tf.math.ceil(batch_size / self.parts), K.dtype(batch_size) + ) + slices = [stride * (i + 1) for i in range(self.parts)] + + for i, _ in enumerate(slices): + if i == 0: + outputs.append(inputs[:slices[0]]) + elif i == len(slices) - 1: + outputs.append(inputs[slices[-2]:]) + else: + outputs.append(inputs[slices[i - 1]:slices[i]]) + + return outputs + + def compute_output_shape(self, input_shape): + if isinstance(input_shape, list): + return [ + o for i in input_shape for o in self.compute_output_shape(i) + ] + + if np.ndim(self.parts) > 0: + return [input_shape] * len(self.parts) + else: + return [input_shape] * self.parts + + def get_config(self): + config = { + 'parts': self.parts, + } + base_config = super(BatchSplit, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class BatchConcat(Layer): + """将第一维进行合并 + 主要是用于自行实现多卡数据并行。 + """ + def compute_mask(self, inputs, mask=None): + if isinstance(mask, list): + if all([m is not None for m in mask]): + return K.concatenate(mask, 0) + + def call(self, inputs): + return K.concatenate(inputs, 0) + + def compute_output_shape(self, input_shape): + return input_shape[0] + + +class MultiHeadAttention(Layer): + """多头注意力机制 + """ + def __init__( + self, + heads, + head_size, + out_dim=None, + key_size=None, + use_bias=True, + normalization='softmax', + attention_scale=True, + attention_dropout=None, + return_attention_scores=False, + kernel_initializer='glorot_uniform', + **kwargs + ): + super(MultiHeadAttention, self).__init__(**kwargs) + self.heads = heads + self.head_size = head_size + self.out_dim = out_dim or heads * head_size + self.key_size = key_size or head_size + self.use_bias = use_bias + self.normalization = normalization + self.attention_scale = attention_scale + self.attention_dropout = attention_dropout + self.return_attention_scores = return_attention_scores + self.kernel_initializer = initializers.get(kernel_initializer) + + def build(self, input_shape): + 
super(MultiHeadAttention, self).build(input_shape) + self.q_dense = Dense( + units=self.key_size * self.heads, + use_bias=self.use_bias, + kernel_initializer=self.kernel_initializer + ) + self.k_dense = Dense( + units=self.key_size * self.heads, + use_bias=self.use_bias, + kernel_initializer=self.kernel_initializer + ) + self.v_dense = Dense( + units=self.head_size * self.heads, + use_bias=self.use_bias, + kernel_initializer=self.kernel_initializer + ) + self.o_dense = Dense( + units=self.out_dim, + use_bias=self.use_bias, + kernel_initializer=self.kernel_initializer + ) + + @recompute_grad + def call(self, inputs, mask=None, **kwargs): + """实现多头注意力 + q_mask: 对输入的query序列的mask。 + 主要是将输出结果的padding部分置0。 + v_mask: 对输入的value序列的mask。 + 主要是防止attention读取到padding信息。 + """ + q, k, v = inputs[:3] + q_mask, v_mask = None, None + if mask is not None: + q_mask, v_mask = mask[0], mask[2] + # 线性变换 + qw = self.q_dense(q) + kw = self.k_dense(k) + vw = self.v_dense(v) + # 形状变换 + qw = K.reshape(qw, (self.heads, self.key_size), -1) + kw = K.reshape(kw, (self.heads, self.key_size), -1) + vw = K.reshape(vw, (self.heads, self.head_size), -1) + # Attention + qkv_inputs = [qw, kw, vw] + inputs[3:] + qv_masks = [q_mask, v_mask] + o, a = self.pay_attention_to(qkv_inputs, qv_masks, **kwargs) + # 完成输出 + o = self.o_dense(K.flatten(o, 2)) + # 返回结果 + if self.return_attention_scores: + return [o, a] + else: + return o + + def pay_attention_to(self, inputs, mask=None, **kwargs): + """实现标准的乘性多头注意力 + a_bias: 对attention矩阵的bias。 + 不同的attention bias对应不同的应用。 + p_bias: 在attention里的位置偏置。 + 一般用来指定相对位置编码的种类。 + 说明: 这里单独分离出pay_attention_to函数,是为了方便 + 继承此类来定义不同形式的attention;此处要求 + 返回o.shape=(batch_size, seq_len, heads, head_size)。 + """ + (qw, kw, vw), n = inputs[:3], 3 + q_mask, v_mask = mask + a_bias, p_bias = kwargs.get('a_bias'), kwargs.get('p_bias') + if a_bias: + a_bias = inputs[n] + n += 1 + if p_bias == 'rotary': + qw, kw = apply_rotary_position_embeddings(inputs[n], qw, kw) + # Attention + a = tf.einsum('bjhd,bkhd->bhjk', qw, kw) + # 处理位置编码 + if p_bias == 'typical_relative': + position_bias = inputs[n] + a = a + tf.einsum('bjhd,jkd->bhjk', qw, position_bias) + elif p_bias == 't5_relative': + position_bias = K.permute_dimensions(inputs[n], (2, 0, 1)) + a = a + K.expand_dims(position_bias, 0) + # Attention(续) + if self.attention_scale: + a = a / self.key_size**0.5 + if a_bias is not None: + if K.ndim(a_bias) == 3: + a_bias = align(a_bias, [0, -2, -1], K.ndim(a)) + a = a + a_bias + a = sequence_masking(a, v_mask, '-inf', -1) + A = attention_normalize(a, -1, self.normalization) + if self.attention_dropout: + #def output(x): + # print("1========") + # return npu_ops.dropout(x)(A) + # print("11========") + #A = keras.layers.Lambda(output)(1-self.attention_dropout) + #import pdb + #pdb.set_trace() + A = Dropout(self.attention_dropout)(A) + #A = npu_ops.dropout(A,keep_prob=1.0-self.attention_dropout) + # 完成输出 + o = tf.einsum('bhjk,bkhd->bjhd', A, vw) + if p_bias == 'typical_relative': + o = o + tf.einsum('bhjk,jkd->bjhd', A, position_bias) + return o, a + + + def compute_output_shape(self, input_shape): + o_shape = (input_shape[0][0], input_shape[0][1], self.out_dim) + if self.return_attention_scores: + a_shape = ( + input_shape[0][0], self.heads, input_shape[0][1], + input_shape[1][1] + ) + return [o_shape, a_shape] + else: + return o_shape + + def compute_mask(self, inputs, mask=None): + if mask is not None: + if self.return_attention_scores: + return [mask[0], None] + else: + return mask[0] + + def get_config(self): + config = { + 
'heads': self.heads, + 'head_size': self.head_size, + 'out_dim': self.out_dim, + 'key_size': self.key_size, + 'use_bias': self.use_bias, + 'normalization': self.normalization, + 'attention_scale': self.attention_scale, + 'attention_dropout': self.attention_dropout, + 'return_attention_scores': self.return_attention_scores, + 'kernel_initializer': + initializers.serialize(self.kernel_initializer), + } + base_config = super(MultiHeadAttention, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class GatedAttentionUnit(Layer): + """门控注意力单元 + 链接:https://arxiv.org/abs/2202.10447 + 介绍:https://kexue.fm/archives/8934 + 说明:没有加入加性相对位置编码,个人认为是不必要的;如果觉得有必要, + 可以自行通过a_bias传入。 + """ + def __init__( + self, + units, + key_size, + activation='swish', + use_bias=True, + normalization='squared_relu', + attention_scale=True, + attention_dropout=None, + kernel_initializer='glorot_uniform', + **kwargs + ): + super(GatedAttentionUnit, self).__init__(**kwargs) + self.units = units + self.key_size = key_size + self.activation = activations.get(activation) + self.use_bias = use_bias + self.normalization = normalization + self.attention_scale = attention_scale + self.attention_dropout = attention_dropout + self.kernel_initializer = initializers.get(kernel_initializer) + + @integerize_shape + def build(self, input_shape): + super(GatedAttentionUnit, self).build(input_shape) + hidden_size = input_shape[-1] + if isinstance(hidden_size, (list, tuple)): + hidden_size = input_shape[0][-1] + self.i_dense = Dense( + units=2 * self.units + self.key_size, + activation=self.activation, + use_bias=self.use_bias, + kernel_initializer=self.kernel_initializer + ) + self.o_dense = Dense( + units=hidden_size, + use_bias=self.use_bias, + kernel_initializer=self.kernel_initializer + ) + self.q_scaleoffset = ScaleOffset(offset=self.use_bias) + self.k_scaleoffset = ScaleOffset(offset=self.use_bias) + + @recompute_grad + def call(self, inputs, mask=None, a_bias=None, p_bias=None): + if not isinstance(inputs, list): + inputs, mask = [inputs], [mask] + x, n = inputs[0], 1 + mask = None if mask is None else mask[0] + if a_bias: + a_bias = inputs[n] + n += 1 + # 投影变换 + x = self.i_dense(x) + u, v, qk = tf.split(x, [self.units, self.units, self.key_size], axis=-1) + q, k = self.q_scaleoffset(qk), self.k_scaleoffset(qk) + # 加入RoPE + if p_bias == 'rotary': + q, k = apply_rotary_position_embeddings(inputs[n], q, k) + # Attention + a = tf.einsum('bmd,bnd->bmn', q, k) + if self.attention_scale: + a = a / self.key_size**0.5 + if a_bias is not None: + a = a + a_bias + a = sequence_masking(a, mask, '-inf', -1) + A = attention_normalize(a, -1, self.normalization) + if self.attention_dropout: + #def output(x): + # return npu_ops.dropout(x)(A) + #A = keras.layers.Lambda(output)(self.attention_dropout) + A = Dropout(self.attention_dropout)(A) + #A = npu_ops.dropout(A,keep_prob=1.0-self.attention_dropout) + # 计算输出 + o = self.o_dense(u * tf.einsum('bmn,bnd->bmd', A, v)) + return o + + def compute_mask(self, inputs, mask=None): + if isinstance(mask, list): + return mask[0] + else: + return mask + + def compute_output_shape(self, input_shape): + if isinstance(input_shape[0], (list, tuple)): + return input_shape[0] + else: + return input_shape + + def get_config(self): + config = { + 'units': self.units, + 'key_size': self.key_size, + 'activation': activations.serialize(self.activation), + 'use_bias': self.use_bias, + 'normalization': self.normalization, + 'attention_scale': self.attention_scale, + 'attention_dropout': 
self.attention_dropout, + 'kernel_initializer': + initializers.serialize(self.kernel_initializer), + } + base_config = super(GatedAttentionUnit, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class LayerNormalization(ScaleOffset): + """(Conditional) Layer Normalization + """ + def __init__( + self, zero_mean=True, unit_variance=True, epsilon=None, **kwargs + ): + super(LayerNormalization, self).__init__(**kwargs) + self.zero_mean = zero_mean + self.unit_variance = unit_variance + self.epsilon = epsilon or K.epsilon() + + @recompute_grad + def call(self, inputs): + """如果是条件Layer Norm,则默认以list为输入,第二个是条件 + """ + if self.conditional: + inputs, conds = inputs + + if self.zero_mean: + mean = K.mean(inputs, axis=-1, keepdims=True) + inputs = inputs - mean + if self.unit_variance: + variance = K.mean(K.square(inputs), axis=-1, keepdims=True) + inputs = inputs / K.sqrt(variance + self.epsilon) + + if self.conditional: + inputs = [inputs, conds] + + return super(LayerNormalization, self).call(inputs) + + def get_config(self): + config = { + 'zero_mean': self.zero_mean, + 'unit_variance': self.unit_variance, + 'epsilon': self.epsilon, + } + base_config = super(LayerNormalization, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + + +class LN(LNScaleOffset): + """Modified Layer Normalization + """ + def __init__( + self, epsilon=1e-12, begin_norm_axis=-1, begin_params_axis=-1, **kwargs + ): + super(LN, self).__init__(**kwargs) + self.epsilon = epsilon + self.begin_norm_axis = begin_norm_axis + self.begin_params_axis = begin_params_axis + + @recompute_grad + def call(self, inputs): + + + return super(LN, self).call(inputs, self.epsilon, self.begin_norm_axis, self.begin_params_axis) + # return layer_norm( + # inputs, gamma=self.gamma, beta=self.beta, + # begin_norm_axis=self.begin_norm_axis, + # begin_params_axis=self.begin_params_axis)[0] + + def get_config(self): + config = { + 'epsilon': self.epsilon, + 'begin_norm_axis' : self.begin_norm_axis, + 'begin_params_axis' : self.begin_params_axis + } + base_config = super(LN, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + + + +class PositionEmbedding(Layer): + """定义可训练的位置Embedding + """ + def __init__( + self, + input_dim, + output_dim, + merge_mode='add', + hierarchical=None, + embeddings_initializer='zeros', + custom_position_ids=False, + **kwargs + ): + super(PositionEmbedding, self).__init__(**kwargs) + self.input_dim = input_dim + self.output_dim = output_dim + self.merge_mode = merge_mode + self.hierarchical = hierarchical + self.embeddings_initializer = initializers.get(embeddings_initializer) + self.custom_position_ids = custom_position_ids + + def build(self, input_shape): + super(PositionEmbedding, self).build(input_shape) + self.embeddings = self.add_weight( + name='embeddings', + shape=(self.input_dim, self.output_dim), + initializer=self.embeddings_initializer + ) + + def call(self, inputs): + """如果custom_position_ids,那么第二个输入为自定义的位置id + """ + if self.custom_position_ids: + inputs, position_ids = inputs + if 'int' not in K.dtype(position_ids): + position_ids = K.cast(position_ids, 'int32') + else: + input_shape = K.shape(inputs) + batch_size, seq_len = input_shape[0], input_shape[1] + position_ids = K.arange(0, seq_len, dtype='int32')[None] + + if self.hierarchical: + alpha = 0.4 if self.hierarchical is True else self.hierarchical + embeddings = self.embeddings - alpha * self.embeddings[:1] + embeddings = embeddings / (1 - alpha) 
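The hierarchical branch just above decomposes a long position p into a "high digit" p // input_dim and a "low digit" p % input_dim, so embeddings trained for input_dim positions can cover roughly input_dim squared positions. A numpy sketch of that decomposition (illustrative only; alpha and shapes are placeholders):

```python
import numpy as np

def hierarchical_positions(embeddings, positions, alpha=0.4):
    """embeddings: trained table [input_dim, depth]; positions: int array."""
    E = (embeddings - alpha * embeddings[:1]) / (1 - alpha)
    n = E.shape[0]
    positions = np.asarray(positions)
    return alpha * E[positions // n] + (1 - alpha) * E[positions % n]

table = np.random.randn(512, 8)                    # trained for 512 positions
out = hierarchical_positions(table, np.arange(1024))
print(out.shape)                                   # (1024, 8) without retraining
```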
+ embeddings_x = K.gather(embeddings, position_ids // self.input_dim) + embeddings_y = K.gather(embeddings, position_ids % self.input_dim) + embeddings = alpha * embeddings_x + (1 - alpha) * embeddings_y + else: + if self.custom_position_ids: + embeddings = K.gather(self.embeddings, position_ids) + else: + embeddings = self.embeddings[None, :seq_len] + + if self.merge_mode == 'add': + return inputs + embeddings + elif self.merge_mode == 'mul': + return inputs * (embeddings + 1.0) + elif self.merge_mode == 'zero': + return embeddings + else: + if not self.custom_position_ids: + embeddings = K.tile(embeddings, [batch_size, 1, 1]) + return K.concatenate([inputs, embeddings]) + + def compute_output_shape(self, input_shape): + if self.custom_position_ids: + input_shape = input_shape[0] + + if self.merge_mode in ['add', 'mul', 'zero']: + return input_shape[:2] + (self.output_dim,) + else: + return input_shape[:2] + (input_shape[2] + self.output_dim,) + + def get_config(self): + config = { + 'input_dim': self.input_dim, + 'output_dim': self.output_dim, + 'merge_mode': self.merge_mode, + 'hierarchical': self.hierarchical, + 'embeddings_initializer': + initializers.serialize(self.embeddings_initializer), + 'custom_position_ids': self.custom_position_ids, + } + base_config = super(PositionEmbedding, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class SinusoidalPositionEmbedding(Layer): + """定义Sin-Cos位置Embedding + """ + def __init__( + self, + output_dim, + merge_mode='add', + custom_position_ids=False, + **kwargs + ): + super(SinusoidalPositionEmbedding, self).__init__(**kwargs) + self.output_dim = output_dim + self.merge_mode = merge_mode + self.custom_position_ids = custom_position_ids + + def call(self, inputs): + """如果custom_position_ids,那么第二个输入为自定义的位置id + """ + if self.custom_position_ids: + inputs, position_ids = inputs + if 'float' not in K.dtype(position_ids): + position_ids = K.cast(position_ids, K.floatx()) + else: + input_shape = K.shape(inputs) + batch_size, seq_len = input_shape[0], input_shape[1] + position_ids = K.arange(0, seq_len, dtype=K.floatx())[None] + + indices = K.arange(0, self.output_dim // 2, dtype=K.floatx()) + indices = K.pow(10000.0, -2 * indices / self.output_dim) + embeddings = tf.einsum('bn,d->bnd', position_ids, indices) + embeddings = K.stack([K.sin(embeddings), K.cos(embeddings)], axis=-1) + embeddings = K.flatten(embeddings, 2) + + if self.merge_mode == 'add': + return inputs + embeddings + elif self.merge_mode == 'mul': + return inputs * (embeddings + 1.0) + elif self.merge_mode == 'zero': + return embeddings + else: + if not self.custom_position_ids: + embeddings = K.tile(embeddings, [batch_size, 1, 1]) + return K.concatenate([inputs, embeddings]) + + def compute_output_shape(self, input_shape): + if self.custom_position_ids: + input_shape = input_shape[0] + + if self.merge_mode in ['add', 'mul', 'zero']: + return input_shape[:2] + (self.output_dim,) + else: + return input_shape[:2] + (input_shape[2] + self.output_dim,) + + def get_config(self): + config = { + 'output_dim': self.output_dim, + 'merge_mode': self.merge_mode, + 'custom_position_ids': self.custom_position_ids, + } + base_config = super(SinusoidalPositionEmbedding, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class RelativePositionEmbedding(Layer): + """相对位置编码 + 来自论文:https://arxiv.org/abs/1803.02155 + """ + def __init__( + self, input_dim, output_dim, embeddings_initializer='zeros', **kwargs + ): + 
super(RelativePositionEmbedding, self).__init__(**kwargs) + self.input_dim = input_dim + self.output_dim = output_dim + self.embeddings_initializer = initializers.get(embeddings_initializer) + + def build(self, input_shape): + super(RelativePositionEmbedding, self).build(input_shape) + self.embeddings = self.add_weight( + name='embeddings', + shape=(self.input_dim, self.output_dim), + initializer=self.embeddings_initializer, + ) + + def call(self, inputs): + pos_ids = self.compute_position_ids(inputs) + return K.gather(self.embeddings, pos_ids) + + def compute_position_ids(self, inputs): + q, v = inputs + # 计算位置差 + q_idxs = K.arange(0, K.shape(q)[1], dtype='int32') + q_idxs = K.expand_dims(q_idxs, 1) + v_idxs = K.arange(0, K.shape(v)[1], dtype='int32') + v_idxs = K.expand_dims(v_idxs, 0) + pos_ids = v_idxs - q_idxs + # 后处理操作 + max_position = (self.input_dim - 1) // 2 + pos_ids = K.clip(pos_ids, -max_position, max_position) + pos_ids = pos_ids + max_position + return pos_ids + + def compute_output_shape(self, input_shape): + return (None, None, self.output_dim) + + def compute_mask(self, inputs, mask): + return mask[0] + + def get_config(self): + config = { + 'input_dim': self.input_dim, + 'output_dim': self.output_dim, + 'embeddings_initializer': + initializers.serialize(self.embeddings_initializer), + } + base_config = super(RelativePositionEmbedding, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class RelativePositionEmbeddingT5(RelativePositionEmbedding): + """Google T5的相对位置编码 + 来自论文:https://arxiv.org/abs/1910.10683 + """ + def __init__( + self, + input_dim, + output_dim, + max_distance=128, + bidirectional=True, + embeddings_initializer='zeros', + **kwargs + ): + super(RelativePositionEmbeddingT5, + self).__init__(input_dim, output_dim, **kwargs) + self.max_distance = max_distance + self.bidirectional = bidirectional + + def compute_position_ids(self, inputs): + """T5的相对位置分桶(直接翻译自官方T5源码) + """ + q, v = inputs + # 计算位置差 + q_idxs = K.arange(0, K.shape(q)[1], dtype='int32') + q_idxs = K.expand_dims(q_idxs, 1) + v_idxs = K.arange(0, K.shape(v)[1], dtype='int32') + v_idxs = K.expand_dims(v_idxs, 0) + pos_ids = v_idxs - q_idxs + # 后处理操作 + num_buckets, max_distance = self.input_dim, self.max_distance + ret = 0 + n = -pos_ids + if self.bidirectional: + num_buckets //= 2 + ret += K.cast(K.less(n, 0), 'int32') * num_buckets + n = K.abs(n) + else: + n = K.maximum(n, 0) + # now n is in the range [0, inf) + max_exact = num_buckets // 2 + is_small = K.less(n, max_exact) + val_if_large = max_exact + K.cast( + K.log(K.cast(n, K.floatx()) / max_exact) / + np.log(max_distance / max_exact) * (num_buckets - max_exact), + 'int32', + ) + val_if_large = K.minimum(val_if_large, num_buckets - 1) + ret += K.switch(is_small, n, val_if_large) + return ret + + def get_config(self): + config = { + 'max_distance': self.max_distance, + 'bidirectional': self.bidirectional, + } + base_config = super(RelativePositionEmbeddingT5, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class FeedForward(Layer): + """FeedForward层 + 如果activation不是一个list,那么它就是两个Dense层的叠加;如果activation是 + 一个list,那么第一个Dense层将会被替换成门控线性单元(Gated Linear Unit)。 + 参考论文: https://arxiv.org/abs/2002.05202 + """ + def __init__( + self, + units, + activation='relu', + use_bias=True, + kernel_initializer='glorot_uniform', + **kwargs + ): + + super(FeedForward, self).__init__(**kwargs) + self.units = units + if not isinstance(activation, list): + activation = [activation] + 
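`RelativePositionEmbeddingT5` above buckets relative distances logarithmically, following the T5 paper. A self-contained numpy translation of that bucketing function (illustrative; the maximum(n, 1) guard only avoids log(0) warnings and does not change the result):

```python
import numpy as np

def t5_relative_bucket(relative_position, num_buckets=32, max_distance=128,
                       bidirectional=True):
    """Map (possibly negative) relative positions to T5-style bucket ids."""
    ret = 0
    n = -np.asarray(relative_position)
    if bidirectional:
        num_buckets //= 2
        ret = ret + (n < 0).astype(np.int32) * num_buckets
        n = np.abs(n)
    else:
        n = np.maximum(n, 0)
    max_exact = num_buckets // 2
    is_small = n < max_exact
    val_if_large = max_exact + (
        np.log(np.maximum(n, 1) / max_exact) / np.log(max_distance / max_exact)
        * (num_buckets - max_exact)
    ).astype(np.int32)
    val_if_large = np.minimum(val_if_large, num_buckets - 1)
    return ret + np.where(is_small, n, val_if_large)

print(t5_relative_bucket(np.arange(-8, 9)))
```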
self.activation = [activations.get(act) for act in activation] + self.use_bias = use_bias + self.kernel_initializer = initializers.get(kernel_initializer) + + @integerize_shape + def build(self, input_shape): + super(FeedForward, self).build(input_shape) + output_dim = input_shape[-1] + + for i, activation in enumerate(self.activation): + i_dense = Dense( + units=self.units, + activation=activation, + use_bias=self.use_bias, + kernel_initializer=self.kernel_initializer + ) + setattr(self, 'i%s_dense' % i, i_dense) + + self.o_dense = Dense( + units=output_dim, + use_bias=self.use_bias, + kernel_initializer=self.kernel_initializer + ) + + @recompute_grad + def call(self, inputs): + x = self.i0_dense(inputs) + for i in range(1, len(self.activation)): + x = x * getattr(self, 'i%s_dense' % i)(inputs) + x = self.o_dense(x) + return x + + def get_config(self): + config = { + 'units': self.units, + 'activation': [ + activations.serialize(act) for act in self.activation + ], + 'use_bias': self.use_bias, + 'kernel_initializer': + initializers.serialize(self.kernel_initializer), + } + base_config = super(FeedForward, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class ConditionalRandomField(Layer): + """纯Keras实现CRF层 + CRF层本质上是一个带训练参数的loss计算层。 + """ + def __init__(self, lr_multiplier=1, **kwargs): + super(ConditionalRandomField, self).__init__(**kwargs) + self.lr_multiplier = lr_multiplier # 当前层学习率的放大倍数 + + @integerize_shape + def build(self, input_shape): + super(ConditionalRandomField, self).build(input_shape) + output_dim = input_shape[-1] + self._trans = self.add_weight( + name='trans', + shape=(output_dim, output_dim), + initializer='glorot_uniform' + ) + if self.lr_multiplier != 1: + K.set_value(self._trans, K.eval(self._trans) / self.lr_multiplier) + + @property + def trans(self): + if self.lr_multiplier != 1: + return self.lr_multiplier * self._trans + else: + return self._trans + + def compute_mask(self, inputs, mask=None): + return None + + def call(self, inputs, mask=None): + return sequence_masking(inputs, mask, '-inf', 1) + + def target_score(self, y_true, y_pred): + """计算目标路径的相对概率(还没有归一化) + 要点:逐标签得分,加上转移概率得分。 + """ + point_score = tf.einsum('bni,bni->b', y_true, y_pred) # 逐标签得分 + trans_score = tf.einsum( + 'bni,ij,bnj->b', y_true[:, :-1], self.trans, y_true[:, 1:] + ) # 标签转移得分 + return point_score + trans_score + + def log_norm_step(self, inputs, states): + """递归计算归一化因子 + 要点:1、递归计算;2、用logsumexp避免溢出。 + """ + inputs, mask = inputs[:, :-1], inputs[:, -1:] + states = K.expand_dims(states[0], 2) # (batch_size, output_dim, 1) + trans = K.expand_dims(self.trans, 0) # (1, output_dim, output_dim) + outputs = K.logsumexp(states + trans, 1) # (batch_size, output_dim) + outputs = outputs + inputs + outputs = mask * outputs + (1 - mask) * states[:, :, 0] + return outputs, [outputs] + + def dense_loss(self, y_true, y_pred): + """y_true需要是one hot形式 + """ + # 导出mask并转换数据类型 + mask = K.all(K.greater(y_pred, -1e6), axis=2, keepdims=True) + mask = K.cast(mask, K.floatx()) + # 计算目标分数 + y_true, y_pred = y_true * mask, y_pred * mask + target_score = self.target_score(y_true, y_pred) + # 递归计算log Z + init_states = [y_pred[:, 0]] + y_pred = K.concatenate([y_pred, mask], axis=2) + input_length = K.int_shape(y_pred[:, 1:])[1] + log_norm, _, _ = K.rnn( + self.log_norm_step, + y_pred[:, 1:], + init_states, + input_length=input_length + ) # 最后一步的log Z向量 + log_norm = K.logsumexp(log_norm, 1) # logsumexp得标量 + # 计算损失 -log p + return log_norm - target_score + + def 
sparse_loss(self, y_true, y_pred): + """y_true需要是整数形式(非one hot) + """ + # y_true需要重新明确一下shape和dtype + y_true = K.reshape(y_true, K.shape(y_pred)[:-1]) + y_true = K.cast(y_true, 'int32') + # 转为one hot + y_true = K.one_hot(y_true, K.shape(self.trans)[0]) + return self.dense_loss(y_true, y_pred) + + def dense_accuracy(self, y_true, y_pred): + """训练过程中显示逐帧准确率的函数,排除了mask的影响 + 此处y_true需要是one hot形式 + """ + y_true = K.argmax(y_true, 2) + return self.sparse_accuracy(y_true, y_pred) + + def sparse_accuracy(self, y_true, y_pred): + """训练过程中显示逐帧准确率的函数,排除了mask的影响 + 此处y_true需要是整数形式(非one hot) + """ + # 导出mask并转换数据类型 + mask = K.all(K.greater(y_pred, -1e6), axis=2) + mask = K.cast(mask, K.floatx()) + # y_true需要重新明确一下shape和dtype + y_true = K.reshape(y_true, K.shape(y_pred)[:-1]) + y_true = K.cast(y_true, 'int32') + # 逐标签取最大来粗略评测训练效果 + y_pred = K.cast(K.argmax(y_pred, 2), 'int32') + isequal = K.cast(K.equal(y_true, y_pred), K.floatx()) + return K.sum(isequal * mask) / K.sum(mask) + + def get_config(self): + config = { + 'lr_multiplier': self.lr_multiplier, + } + base_config = super(ConditionalRandomField, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class MaximumEntropyMarkovModel(Layer): + """(双向)最大熵隐马尔可夫模型 + 作用和用法都类似CRF,但是比CRF更快更简单。 + """ + def __init__(self, lr_multiplier=1, hidden_dim=None, **kwargs): + super(MaximumEntropyMarkovModel, self).__init__(**kwargs) + self.lr_multiplier = lr_multiplier # 当前层学习率的放大倍数 + self.hidden_dim = hidden_dim # 如果非None,则将转移矩阵低秩分解 + + @integerize_shape + def build(self, input_shape): + super(MaximumEntropyMarkovModel, self).build(input_shape) + output_dim = input_shape[-1] + + if self.hidden_dim is None: + self._trans = self.add_weight( + name='trans', + shape=(output_dim, output_dim), + initializer='glorot_uniform' + ) + if self.lr_multiplier != 1: + K.set_value( + self._trans, + K.eval(self._trans) / self.lr_multiplier + ) + else: + self._l_trans = self.add_weight( + name='l_trans', + shape=(output_dim, self.hidden_dim), + initializer='glorot_uniform' + ) + self._r_trans = self.add_weight( + name='r_trans', + shape=(output_dim, self.hidden_dim), + initializer='glorot_uniform' + ) + + if self.lr_multiplier != 1: + K.set_value( + self._l_trans, + K.eval(self._l_trans) / self.lr_multiplier + ) + K.set_value( + self._r_trans, + K.eval(self._r_trans) / self.lr_multiplier + ) + + @property + def trans(self): + if self.lr_multiplier != 1: + return self.lr_multiplier * self._trans + else: + return self._trans + + @property + def l_trans(self): + if self.lr_multiplier != 1: + return self.lr_multiplier * self._l_trans + else: + return self._l_trans + + @property + def r_trans(self): + if self.lr_multiplier != 1: + return self.lr_multiplier * self._r_trans + else: + return self._r_trans + + def compute_mask(self, inputs, mask=None): + return None + + def call(self, inputs, mask=None): + return sequence_masking(inputs, mask, '-inf', 1) + + def reverse_sequence(self, inputs, mask=None): + if mask is None: + return [x[:, ::-1] for x in inputs] + else: + length = K.cast(K.sum(mask, 1), 'int32') + return [tf.reverse_sequence(x, length, seq_axis=1) for x in inputs] + + def basic_loss(self, y_true, y_pred, go_backwards=False): + """y_true需要是整数形式(非one hot) + """ + # 导出mask并转换数据类型 + mask = K.all(K.greater(y_pred, -1e6), axis=2) + mask = K.cast(mask, K.floatx()) + # y_true需要重新明确一下shape和dtype + y_true = K.reshape(y_true, K.shape(y_pred)[:-1]) + y_true = K.cast(y_true, 'int32') + # 反转相关 + if self.hidden_dim is None: + if go_backwards: # 是否反转序列 + 
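# Usage sketch (illustrative only; the variable names below are hypothetical):
# ConditionalRandomField (and MaximumEntropyMarkovModel, which exposes the
# same interface) is attached as the last layer and its loss/metric methods
# are passed to compile, e.g.
#     crf = ConditionalRandomField(lr_multiplier=100)
#     output = crf(Dense(num_labels)(features))
#     model = keras.models.Model(inputs, output)
#     model.compile(loss=crf.sparse_loss, optimizer='adam',
#                   metrics=[crf.sparse_accuracy])
# where y_true is a (batch_size, seq_len) tensor of integer label ids.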
y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask) + trans = K.transpose(self.trans) + else: + trans = self.trans + history = K.gather(trans, y_true) + else: + if go_backwards: # 是否反转序列 + y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask) + r_trans, l_trans = self.l_trans, self.r_trans + else: + l_trans, r_trans = self.l_trans, self.r_trans + history = K.gather(l_trans, y_true) + history = tf.einsum('bnd,kd->bnk', history, r_trans) + # 计算loss + history = K.concatenate([y_pred[:, :1], history[:, :-1]], 1) + y_pred = (y_pred + history) / 2 + loss = K.sparse_categorical_crossentropy( + y_true, y_pred, from_logits=True + ) + return K.sum(loss * mask) / K.sum(mask) + + def sparse_loss(self, y_true, y_pred): + """y_true需要是整数形式(非one hot) + """ + loss = self.basic_loss(y_true, y_pred, False) + loss = loss + self.basic_loss(y_true, y_pred, True) + return loss / 2 + + def dense_loss(self, y_true, y_pred): + """y_true需要是one hot形式 + """ + y_true = K.argmax(y_true, 2) + return self.sparse_loss(y_true, y_pred) + + def basic_accuracy(self, y_true, y_pred, go_backwards=False): + """训练过程中显示逐帧准确率的函数,排除了mask的影响 + 此处y_true需要是整数形式(非one hot) + """ + # 导出mask并转换数据类型 + mask = K.all(K.greater(y_pred, -1e6), axis=2) + mask = K.cast(mask, K.floatx()) + # y_true需要重新明确一下shape和dtype + y_true = K.reshape(y_true, K.shape(y_pred)[:-1]) + y_true = K.cast(y_true, 'int32') + # 反转相关 + if self.hidden_dim is None: + if go_backwards: # 是否反转序列 + y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask) + trans = K.transpose(self.trans) + else: + trans = self.trans + history = K.gather(trans, y_true) + else: + if go_backwards: # 是否反转序列 + y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask) + r_trans, l_trans = self.l_trans, self.r_trans + else: + l_trans, r_trans = self.l_trans, self.r_trans + history = K.gather(l_trans, y_true) + history = tf.einsum('bnd,kd->bnk', history, r_trans) + # 计算逐标签accuracy + history = K.concatenate([y_pred[:, :1], history[:, :-1]], 1) + y_pred = (y_pred + history) / 2 + y_pred = K.cast(K.argmax(y_pred, 2), 'int32') + isequal = K.cast(K.equal(y_true, y_pred), K.floatx()) + return K.sum(isequal * mask) / K.sum(mask) + + def sparse_accuracy(self, y_true, y_pred): + """训练过程中显示逐帧准确率的函数,排除了mask的影响 + 此处y_true需要是整数形式(非one hot) + """ + accuracy = self.basic_accuracy(y_true, y_pred, False) + accuracy = accuracy + self.basic_accuracy(y_true, y_pred, True) + return accuracy / 2 + + def dense_accuracy(self, y_true, y_pred): + """训练过程中显示逐帧准确率的函数,排除了mask的影响 + 此处y_true需要是one hot形式 + """ + y_true = K.argmax(y_true, 2) + return self.sparse_accuracy(y_true, y_pred) + + def get_config(self): + config = { + 'lr_multiplier': self.lr_multiplier, + 'hidden_dim': self.hidden_dim, + } + base_config = super(MaximumEntropyMarkovModel, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class GlobalPointer(Layer): + """全局指针模块 + 将序列的每个(start, end)作为整体来进行判断 + 参考:https://kexue.fm/archives/8373 + """ + def __init__( + self, + heads, + head_size, + RoPE=True, + use_bias=True, + tril_mask=True, + kernel_initializer='lecun_normal', + **kwargs + ): + super(GlobalPointer, self).__init__(**kwargs) + self.heads = heads + self.head_size = head_size + self.RoPE = RoPE + self.use_bias = use_bias + self.tril_mask = tril_mask + self.kernel_initializer = initializers.get(kernel_initializer) + + def build(self, input_shape): + super(GlobalPointer, self).build(input_shape) + self.dense = Dense( + units=self.head_size * self.heads * 2, + use_bias=self.use_bias, + 
kernel_initializer=self.kernel_initializer + ) + + def compute_mask(self, inputs, mask=None): + return None + + @recompute_grad + def call(self, inputs, mask=None): + # 输入变换 + inputs = self.dense(inputs) + inputs = tf.split(inputs, self.heads, axis=-1) + inputs = K.stack(inputs, axis=-2) + qw, kw = inputs[..., :self.head_size], inputs[..., self.head_size:] + # RoPE编码 + if self.RoPE: + pos = SinusoidalPositionEmbedding(self.head_size, 'zero')(inputs) + qw, kw = apply_rotary_position_embeddings(pos, qw, kw) + # 计算内积 + logits = tf.einsum('bmhd,bnhd->bhmn', qw, kw) + # 排除padding + logits = sequence_masking(logits, mask, '-inf', 2) + logits = sequence_masking(logits, mask, '-inf', 3) + # 排除下三角 + if self.tril_mask: + mask = tf.linalg.band_part(K.ones_like(logits), 0, -1) + logits = logits - (1 - mask) * K.infinity() + # scale返回 + return logits / self.head_size**0.5 + + def compute_output_shape(self, input_shape): + return (input_shape[0], self.heads, input_shape[1], input_shape[1]) + + def get_config(self): + config = { + 'heads': self.heads, + 'head_size': self.head_size, + 'RoPE': self.RoPE, + 'use_bias': self.use_bias, + 'tril_mask': self.tril_mask, + 'kernel_initializer': + initializers.serialize(self.kernel_initializer), + } + base_config = super(GlobalPointer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class EfficientGlobalPointer(GlobalPointer): + """更加参数高效的GlobalPointer + 参考:https://kexue.fm/archives/8877 + """ + def build(self, input_shape): + self.p_dense = Dense( + units=self.head_size * 2, + use_bias=self.use_bias, + kernel_initializer=self.kernel_initializer + ) + self.q_dense = Dense( + units=self.heads * 2, + use_bias=self.use_bias, + kernel_initializer=self.kernel_initializer + ) + self.built = True + + @recompute_grad + def call(self, inputs, mask=None): + # 输入变换 + inputs = self.p_dense(inputs) + qw, kw = inputs[..., ::2], inputs[..., 1::2] + # RoPE编码 + if self.RoPE: + pos = SinusoidalPositionEmbedding(self.head_size, 'zero')(inputs) + qw, kw = apply_rotary_position_embeddings(pos, qw, kw) + # 计算内积 + logits = tf.einsum('bmd,bnd->bmn', qw, kw) / self.head_size**0.5 + bias = tf.einsum('bnh->bhn', self.q_dense(inputs)) / 2 + logits = logits[:, None] + bias[:, ::2, None] + bias[:, 1::2, :, None] + # 排除padding + logits = sequence_masking(logits, mask, '-inf', 2) + logits = sequence_masking(logits, mask, '-inf', 3) + # 排除下三角 + if self.tril_mask: + mask = tf.linalg.band_part(K.ones_like(logits), 0, -1) + logits = logits - (1 - mask) * K.infinity() + # 返回最终结果 + return logits + + +class Loss(Layer): + """特殊的层,用来定义复杂loss + """ + def __init__(self, output_axis=None, **kwargs): + super(Loss, self).__init__(**kwargs) + self.output_axis = output_axis + + def call(self, inputs, mask=None): + loss = self.compute_loss(inputs, mask) + self.add_loss(loss, inputs=inputs) + if self.output_axis is None: + return inputs + elif isinstance(self.output_axis, list): + return [inputs[i] for i in self.output_axis] + else: + return inputs[self.output_axis] + + def compute_loss(self, inputs, mask=None): + raise NotImplementedError + + def compute_output_shape(self, input_shape): + if self.output_axis is None: + return input_shape + elif isinstance(self.output_axis, list): + return [input_shape[i] for i in self.output_axis] + else: + return input_shape[self.output_axis] + + def compute_mask(self, inputs, mask): + if mask is not None: + if self.output_axis is None: + return mask + elif isinstance(self.output_axis, list): + return [mask[i] for i in self.output_axis] + 
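# Illustrative sketch (not part of the original file): GlobalPointer and
# EfficientGlobalPointer return scores of shape
# (batch_size, heads, seq_len, seq_len), where entry [b, h, i, j] scores the
# span (i, j) as an entity of type h. Assuming the threshold-at-zero setup
# from https://kexue.fm/archives/8373, a minimal decode could be
#     scores = model.predict([token_ids, segment_ids])[0]
#     entities = [(h, i, j) for h, i, j in zip(*np.where(scores > 0))]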
else: + return mask[self.output_axis] + + def get_config(self): + config = { + 'output_axis': self.output_axis, + } + base_config = super(Loss, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +custom_objects = { + 'Embedding': Embedding, + 'ScaleOffset': ScaleOffset, + 'Concatenate1D': Concatenate1D, + 'BatchSplit': BatchSplit, + 'BatchConcat': BatchConcat, + 'MultiHeadAttention': MultiHeadAttention, + 'GatedAttentionUnit': GatedAttentionUnit, + 'LayerNormalization': LayerNormalization, + 'PositionEmbedding': PositionEmbedding, + 'SinusoidalPositionEmbedding': SinusoidalPositionEmbedding, + 'RelativePositionEmbedding': RelativePositionEmbedding, + 'RelativePositionEmbeddingT5': RelativePositionEmbeddingT5, + 'FeedForward': FeedForward, + 'ConditionalRandomField': ConditionalRandomField, + 'MaximumEntropyMarkovModel': MaximumEntropyMarkovModel, + 'GlobalPointer': GlobalPointer, + 'EfficientGlobalPointer': EfficientGlobalPointer, + 'Loss': Loss, +} + +keras.utils.get_custom_objects().update(custom_objects) + diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/models.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/models.py new file mode 100644 index 0000000000000000000000000000000000000000..115a5eb6380a8b08c2717f2305140ec92b160aab --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/models.py @@ -0,0 +1,2766 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding: utf-8 -*- +# 主要模型 +from npu_bridge.npu_init import * +from npu_bridge.estimator.npu_aicore_ops import layer_norm +import numpy as np +from examples.bertkeras.backend import get_available_gpus +from examples.bertkeras.layers import * +from examples.bertkeras.snippets import insert_arguments +from examples.bertkeras.snippets import delete_arguments +from examples.bertkeras.snippets import is_string, string_matching +from examples.bertkeras.snippets import orthogonally_resize +from keras.models import Model +import tensorflow as tf +from time import sleep +import json + + +class Transformer(object): + """模型基类 + """ + def __init__( + self, + vocab_size, # 词表大小 + hidden_size, # 编码维度 + num_hidden_layers, # Transformer总层数 + num_attention_heads, # Attention的头数 + intermediate_size, # FeedForward的隐层维度 + hidden_act, # FeedForward隐层的激活函数 + dropout_rate=None, # Dropout比例 + attention_dropout_rate=None, # Attention矩阵的Dropout比例 + embedding_size=None, # 是否指定embedding_size + attention_head_size=None, # Attention中V的head_size + attention_key_size=None, # Attention中Q,K的head_size + sequence_length=None, # 是否固定序列长度 + keep_tokens=None, # 要保留的词ID列表 + compound_tokens=None, # 扩展Embedding + residual_attention_scores=False, # Attention矩阵加残差 + ignore_invalid_weights=False, # 允许跳过不存在的权重 + autoresize_weights=False, # 自动变换形状不匹配的权重 + layers=None, # 外部传入的Keras层 + prefix=None, # 层名前缀 + name=None, # 模型名称 + **kwargs + ): + if keep_tokens is not None: + vocab_size = len(keep_tokens) + if compound_tokens is not None: + vocab_size += len(compound_tokens) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.attention_head_size = attention_head_size or hidden_size // num_attention_heads + self.attention_key_size = attention_key_size or self.attention_head_size + self.intermediate_size = intermediate_size + self.dropout_rate = dropout_rate or 0 + self.attention_dropout_rate = attention_dropout_rate or 0 + self.hidden_act = hidden_act + self.embedding_size = embedding_size or hidden_size + self.sequence_length = sequence_length + self.keep_tokens = keep_tokens + self.compound_tokens = compound_tokens + self.attention_bias = None + self.position_bias = None + self.attention_scores = None + self.residual_attention_scores = residual_attention_scores + self.ignore_invalid_weights = ignore_invalid_weights + self.autoresize_weights = autoresize_weights + self.layers = {} if layers is None else layers + self.prefix = prefix or '' + self.name = name + self.built = False + + def build( + self, + attention_caches=None, + layer_norm_cond=None, + layer_norm_cond_hidden_size=None, + layer_norm_cond_hidden_act=None, + additional_input_layers=None, + **kwargs + ): + """模型构建函数 + attention_caches:为Attention的K,V的缓存序列字典,格式为 + {Attention层名: [K缓存, V缓存]}; + layer_norm_*系列参数:实现Conditional Layer Normalization时使用, + 用来实现以“固定长度向量”为条件的条件Bert。 + """ + if self.built: + return None + # Input + inputs = self.get_inputs() + self.set_inputs(inputs, additional_input_layers) + # Other + self.attention_caches = attention_caches or {} + self.layer_norm_conds = [ + layer_norm_cond, + layer_norm_cond_hidden_size, + layer_norm_cond_hidden_act or 'linear', + ] + # Call + outputs = self.call(inputs) + self.set_outputs(outputs) + # Model + self.model = Model(self.inputs, self.outputs, name=self.name) + self.built = True + + def call(self, inputs): + """定义模型的执行流程 + """ + # Embedding + outputs = self.apply_embeddings(inputs) + # Main + for i in 
range(self.num_hidden_layers): + outputs = self.apply_main_layers(outputs, i) + # Final + outputs = self.apply_final_layers(outputs) + return outputs + + def prefixed(self, name): + """给名字加前缀 + """ + if name is not None: + return self.prefix + name + + def apply(self, inputs=None, layer=None, arguments=None, **kwargs): + """通过apply调用层会自动重用同名层 + inputs: 上一层的输出; + layer: 要调用的层类名; + arguments: 传递给layer.call的参数; + kwargs: 传递给层初始化的参数。 + """ + if layer is Dropout and self.dropout_rate == 0: + return inputs + + if layer is MultiHeadAttention and self.residual_attention_scores: + kwargs['return_attention_scores'] = True + + arguments = arguments or {} + if layer is Lambda: + kwargs['arguments'] = arguments + arguments = {} + + name = self.prefixed(kwargs.get('name')) + kwargs['name'] = name + if name not in self.layers: + layer = layer(**kwargs) + name = layer.name + self.layers[name] = layer + + if inputs is None: + return self.layers[name] + else: + if isinstance(self.layers[name], MultiHeadAttention): + if name in self.attention_caches: + # 如果检测到Cache的传入,那么自动在Key,Value处拼接起来 + k_cache, v_cache = self.attention_caches[name] + k_name, v_name = name + '-Cached-Key', name + '-Cached-Value' + k = Concatenate1D(name=k_name)([k_cache, inputs[1]]) + v = Concatenate1D(name=v_name)([v_cache, inputs[2]]) + inputs = inputs[:1] + [k, v] + inputs[3:] + if self.residual_attention_scores: + # 如果使用残差Attention矩阵,则给每个Attention矩阵加上前上一层的Attention + # 矩阵,这对应RealFormer设计(https://arxiv.org/abs/2012.11747)。目前 + # 该实现还相对粗糙,可能欠缺通用性。 + if self.attention_scores is not None: + if arguments.get('a_bias'): + a_bias = Add(name=name + '-Attention-Bias' + )([inputs[3], self.attention_scores]) + inputs = inputs[:3] + [a_bias] + inputs[4:] + else: + a_bias = self.attention_scores + inputs = inputs[:3] + [a_bias] + inputs[3:] + arguments['a_bias'] = True + o, a = self.layers[name](inputs, **arguments) + self.attention_scores = a + return o + return self.layers[name](inputs, **arguments) + + def get_inputs(self): + raise NotImplementedError + + def apply_embeddings(self, inputs): + raise NotImplementedError + + def apply_main_layers(self, inputs, index): + raise NotImplementedError + + def apply_final_layers(self, inputs): + raise NotImplementedError + + def compute_attention_bias(self, inputs=None): + """定义每一层的Attention Bias + """ + return self.attention_bias + + def compute_position_bias(self, inputs=None): + """定义每一层的Position Bias(一般相对位置编码用) + """ + return self.position_bias + + def set_inputs(self, inputs, additional_input_layers=None): + """设置input和inputs属性 + """ + if inputs is None: + inputs = [] + elif not isinstance(inputs, list): + inputs = [inputs] + + inputs = inputs[:] + if additional_input_layers is not None: + if not isinstance(additional_input_layers, list): + additional_input_layers = [additional_input_layers] + inputs.extend(additional_input_layers) + + self.inputs = inputs + if len(inputs) > 1: + self.input = inputs + else: + self.input = inputs[0] + + def set_outputs(self, outputs): + """设置output和outputs属性 + """ + if not isinstance(outputs, list): + outputs = [outputs] + + outputs = outputs[:] + self.outputs = outputs + if len(outputs) > 1: + self.output = outputs + else: + self.output = outputs[0] + + @property + def initializer(self): + """默认使用截断正态分布初始化 + """ + return keras.initializers.TruncatedNormal(stddev=0.02) + + def simplify(self, inputs): + """将list中的None过滤掉 + """ + inputs = [i for i in inputs if i is not None] + if len(inputs) == 1: + inputs = inputs[0] + + return inputs + + def load_embeddings(self, 
embeddings): + """处理Embedding层权重 + """ + embeddings = embeddings.astype(K.floatx()) # 防止np.average报错 + + if self.keep_tokens is not None: + embeddings = embeddings[self.keep_tokens] + + if self.compound_tokens is not None: + ext_embeddings = [] + for item in self.compound_tokens: + if isinstance(item, list): + item = (item, [1] * len(item)) + ext_embeddings.append( + np.average(embeddings[item[0]], 0, item[1]) + ) + embeddings = np.concatenate([embeddings, ext_embeddings], 0) + + return embeddings + + def load_variable(self, checkpoint, name): + """加载单个变量的函数 + """ + if isinstance(checkpoint, dict): + return checkpoint[name] + else: + return tf.train.load_variable(checkpoint, name) + + def create_variable(self, name, value, dtype=None): + """创建一个变量 + """ + dtype = dtype or K.floatx() + return K.variable( + self.initializer(value.shape, dtype), dtype, name=name + ), value + + def variable_mapping(self): + """构建keras层与checkpoint的变量名之间的映射表 + """ + return {} + + def load_weights_from_checkpoint(self, checkpoint, mapping=None): + """根据mapping从checkpoint加载权重 + """ + mapping = mapping or self.variable_mapping() + mapping = {self.prefixed(k): v for k, v in mapping.items()} + mapping = {k: v for k, v in mapping.items() if k in self.layers} + + weight_value_pairs = [] + for layer, variables in mapping.items(): + layer = self.layers[layer] + weights, values = [], [] + + for w, v in zip(layer.trainable_weights, variables): # 允许跳过不存在的权重 + try: + values.append(self.load_variable(checkpoint, v)) + weights.append(w) + except Exception as e: + if self.ignore_invalid_weights: + print('%s, but ignored.' % e.message) + else: + raise e + + for i, (w, v) in enumerate(zip(weights, values)): + if v is not None: + w_shape, v_shape = K.int_shape(w), v.shape + if self.autoresize_weights and w_shape != v_shape: + v = orthogonally_resize(v, w_shape) + if isinstance(layer, MultiHeadAttention): + count = 2 + if layer.use_bias: + count += 2 + if layer.attention_scale and i < count: + scale = 1.0 * w_shape[-1] / v_shape[-1] + v = v * scale**0.25 + if isinstance(layer, FeedForward): + count = 1 + if layer.use_bias: + count += 1 + if self.hidden_act in ['relu', 'leaky_relu']: + count -= 2 + if i < count: + v *= np.sqrt(1.0 * w_shape[-1] / v_shape[-1]) + else: + v *= np.sqrt(1.0 * v_shape[0] / w_shape[0]) + + weight_value_pairs.append((w, v)) + + K.batch_set_value(weight_value_pairs) + + def save_weights_as_checkpoint(self, filename, mapping=None, dtype=None): + """根据mapping将权重保存为checkpoint格式 + """ + mapping = mapping or self.variable_mapping() + mapping = {self.prefixed(k): v for k, v in mapping.items()} + mapping = {k: v for k, v in mapping.items() if k in self.layers} + + with tf.Graph().as_default(): + all_variables, all_values = [], [] + for layer, variables in mapping.items(): + layer = self.layers[layer] + values = K.batch_get_value(layer.trainable_weights) + for name, value in zip(variables, values): + variable, value = self.create_variable(name, value, dtype) + all_variables.append(variable) + all_values.append(value) + with tf.Session() as sess: + K.batch_set_value(zip(all_variables, all_values)) + saver = tf.train.Saver() + saver.save(sess, filename) + + +class LM_Mask(object): + """定义下三角Attention Mask(语言模型用) + """ + def compute_attention_bias(self, inputs=None): + """通过idxs序列的比较来得到对应的mask + """ + if self.attention_bias is None: + + def lm_mask(s): + seq_len = K.shape(s)[1] + idxs = K.arange(0, seq_len) + mask = idxs[None, :] <= idxs[:, None] + mask = K.cast(mask, K.floatx()) + return -(1 - mask) * K.infinity() + + 
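# Worked example (illustrative): for seq_len = 4 the inner lm_mask above
# builds the lower-triangular matrix
#     [[1, 0, 0, 0],
#      [1, 1, 0, 0],
#      [1, 1, 1, 0],
#      [1, 1, 1, 1]]
# and returns -(1 - mask) * K.infinity(), i.e. 0 where attention is allowed
# and -inf wherever a token would attend to a future position.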
self.attention_bias = self.apply( + inputs=self.inputs[0], + layer=Lambda, + function=lm_mask, + name='Attention-LM-Mask' + ) + + return self.attention_bias + + +class UniLM_Mask(object): + """定义UniLM的Attention Mask(Seq2Seq模型用) + 其中source和target的分区,由segment_ids来表示。 + UniLM: https://arxiv.org/abs/1905.03197 + """ + def compute_attention_bias(self, inputs=None): + """通过idxs序列的比较来得到对应的mask + """ + if self.attention_bias is None: + + def unilm_mask(s): + idxs = K.cumsum(s, axis=1) + mask = idxs[:, None, :] <= idxs[:, :, None] + mask = K.cast(mask, K.floatx()) + return -(1 - mask) * K.infinity() + + self.attention_bias = self.apply( + inputs=self.inputs[1], + layer=Lambda, + function=unilm_mask, + name='Attention-UniLM-Mask' + ) + + return self.attention_bias + + +class BERT(Transformer): + """构建BERT模型 + """ + def __init__( + self, + max_position, # 序列最大长度 + segment_vocab_size=2, # segment总数目 + with_pool=False, # 是否包含Pool部分 + with_nsp=False, # 是否包含NSP部分 + with_mlm=False, # 是否包含MLM部分 + hierarchical_position=None, # 是否层次分解位置编码 + custom_position_ids=False, # 是否自行传入位置id + shared_segment_embeddings=False, # 若True,则segment跟token共用embedding + **kwargs # 其余参数 + ): + super(BERT, self).__init__(**kwargs) + self.max_position = max_position + self.segment_vocab_size = segment_vocab_size + self.with_pool = with_pool + self.with_nsp = with_nsp + self.with_mlm = with_mlm + self.hierarchical_position = hierarchical_position + self.custom_position_ids = custom_position_ids + self.shared_segment_embeddings = shared_segment_embeddings + if self.with_nsp and not self.with_pool: + self.with_pool = True + + def get_inputs(self): + """BERT的输入是token_ids和segment_ids + (但允许自行传入位置id,以实现一些特殊需求) + """ + x_in = self.apply( + layer=Input, shape=(self.sequence_length,), name='Input-Token' + ) + inputs = [x_in] + + if self.segment_vocab_size > 0: + s_in = self.apply( + layer=Input, + shape=(self.sequence_length,), + name='Input-Segment' + ) + inputs.append(s_in) + + if self.custom_position_ids: + p_in = self.apply( + layer=Input, + shape=(self.sequence_length,), + name='Input-Position' + ) + inputs.append(p_in) + + return inputs + + def apply_embeddings(self, inputs): + """BERT的embedding是token、position、segment三者embedding之和 + """ + inputs = inputs[:] + x = inputs.pop(0) + if self.segment_vocab_size > 0: + s = inputs.pop(0) + if self.custom_position_ids: + p = inputs.pop(0) + else: + p = None + z = self.layer_norm_conds[0] + + x = self.apply( + inputs=x, + layer=Embedding, + input_dim=self.vocab_size, + output_dim=self.embedding_size, + embeddings_initializer=self.initializer, + mask_zero=True, + name='Embedding-Token' + ) + if self.segment_vocab_size > 0: + if self.shared_segment_embeddings: + name = 'Embedding-Token' + else: + name = 'Embedding-Segment' + s = self.apply( + inputs=s, + layer=Embedding, + input_dim=self.segment_vocab_size, + output_dim=self.embedding_size, + embeddings_initializer=self.initializer, + name=name + ) + x = self.apply( + inputs=[x, s], layer=Add, name='Embedding-Token-Segment' + ) + x = self.apply( + inputs=self.simplify([x, p]), + layer=PositionEmbedding, + input_dim=self.max_position, + output_dim=self.embedding_size, + merge_mode='add', + hierarchical=self.hierarchical_position, + embeddings_initializer=self.initializer, + custom_position_ids=self.custom_position_ids, + name='Embedding-Position' + ) + x = self.apply( + inputs=self.simplify([x, z]), + layer=LN, + epsilon=1e-12, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + 
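# Worked example (illustrative): in UniLM_Mask above, segment_ids
# s = [0, 0, 0, 1, 1] give idxs = cumsum(s) = [0, 0, 0, 1, 2]; position q may
# attend to position k iff idxs[k] <= idxs[q], so source tokens (segment 0)
# attend to the whole source bidirectionally, while each target token
# (segment 1) attends to the source plus the target tokens up to itself,
# which is exactly the Seq2Seq mask of UniLM.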
hidden_initializer=self.initializer, + name='Embedding-Norm' + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='Embedding-Dropout' + ) + if self.embedding_size != self.hidden_size: + x = self.apply( + inputs=x, + layer=Dense, + units=self.hidden_size, + kernel_initializer=self.initializer, + name='Embedding-Mapping' + ) + + return x + + def apply_main_layers(self, inputs, index): + """BERT的主体是基于Self-Attention的模块 + 顺序:Att --> Add --> LN --> FFN --> Add --> LN + """ + x = inputs + z = self.layer_norm_conds[0] + + attention_name = 'Transformer-%d-MultiHeadSelfAttention' % index + feed_forward_name = 'Transformer-%d-FeedForward' % index + attention_mask = self.compute_attention_bias(index) + + # Self Attention + xi, x, arguments = x, [x, x, x], {'a_bias': None} + if attention_mask is not None: + arguments['a_bias'] = True + x.append(attention_mask) + + x = self.apply( + inputs=x, + layer=MultiHeadAttention, + arguments=arguments, + heads=self.num_attention_heads, + head_size=self.attention_head_size, + out_dim=self.hidden_size, + key_size=self.attention_key_size, + attention_dropout=self.attention_dropout_rate, + kernel_initializer=self.initializer, + name=attention_name + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='%s-Dropout' % attention_name + ) + x = self.apply( + inputs=[xi, x], layer=Add, name='%s-Add' % attention_name + ) + x = self.apply( + inputs=self.simplify([x, z]), + layer=LN, + epsilon=1e-12, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='%s-Norm' % attention_name + ) + + # Feed Forward + xi = x + x = self.apply( + inputs=x, + layer=FeedForward, + units=self.intermediate_size, + activation=self.hidden_act, + kernel_initializer=self.initializer, + name=feed_forward_name + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='%s-Dropout' % feed_forward_name + ) + x = self.apply( + inputs=[xi, x], layer=Add, name='%s-Add' % feed_forward_name + ) + x = self.apply( + inputs=self.simplify([x, z]), + layer=LN, + epsilon=1e-12, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + begin_norm_axis=-1, + begin_params_axis=-1, + name='%s-Norm' % feed_forward_name + ) + + return x + + def apply_final_layers(self, inputs): + """根据剩余参数决定输出 + """ + x = inputs + z = self.layer_norm_conds[0] + outputs = [x] + + if self.with_pool: + # Pooler部分(提取CLS向量) + x = outputs[0] + x = self.apply( + inputs=x, + layer=Lambda, + function=lambda x: x[:, 0], + name='Pooler' + ) + pool_activation = 'tanh' if self.with_pool is True else self.with_pool + x = self.apply( + inputs=x, + layer=Dense, + units=self.hidden_size, + activation=pool_activation, + kernel_initializer=self.initializer, + name='Pooler-Dense' + ) + if self.with_nsp: + # Next Sentence Prediction部分 + x = self.apply( + inputs=x, + layer=Dense, + units=2, + activation='softmax', + kernel_initializer=self.initializer, + name='NSP-Proba' + ) + outputs.append(x) + + if self.with_mlm: + # Masked Language Model部分 + x = outputs[0] + x = self.apply( + inputs=x, + layer=Dense, + units=self.embedding_size, + activation=self.hidden_act, + kernel_initializer=self.initializer, + name='MLM-Dense' + ) + x = self.apply( + inputs=self.simplify([x, z]), + layer=LN, + epsilon=1e-12, + conditional=(z is not None), + 
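# Note (illustrative summary of apply_final_layers): with_pool feeds the [CLS]
# vector through a tanh Dense, with_nsp adds a 2-way softmax on top of it, and
# with_mlm ties the output projection to the 'Embedding-Token' matrix (reused
# in 'dense' mode) followed by a bias and softmax; when any head is enabled,
# only the head outputs are returned instead of the raw sequence output.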
hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='MLM-Norm' + ) + x = self.apply( + inputs=x, + layer=Embedding, + arguments={'mode': 'dense'}, + name='Embedding-Token' + ) + x = self.apply( + inputs=x, layer=ScaleOffset, scale=False, name='MLM-Bias' + ) + mlm_activation = 'softmax' if self.with_mlm is True else self.with_mlm + x = self.apply( + inputs=x, + layer=Activation, + activation=mlm_activation, + name='MLM-Activation' + ) + outputs.append(x) + + if len(outputs) == 1: + outputs = outputs[0] + elif len(outputs) == 2: + outputs = outputs[1] + else: + outputs = outputs[1:] + + return outputs + + def load_variable(self, checkpoint, name): + """加载单个变量的函数 + """ + variable = super(BERT, self).load_variable(checkpoint, name) + if name in [ + 'bert/embeddings/word_embeddings', + 'cls/predictions/output_bias', + ]: + return self.load_embeddings(variable) + elif name == 'cls/seq_relationship/output_weights': + return variable.T + else: + return variable + + def create_variable(self, name, value, dtype=None): + """在tensorflow中创建一个变量 + """ + if name == 'cls/seq_relationship/output_weights': + value = value.T + return super(BERT, self).create_variable(name, value, dtype) + + def variable_mapping(self): + """映射到官方BERT权重格式 + """ + mapping = { + 'Embedding-Token': ['bert/embeddings/word_embeddings'], + 'Embedding-Segment': ['bert/embeddings/token_type_embeddings'], + 'Embedding-Position': ['bert/embeddings/position_embeddings'], + 'Embedding-Norm': [ + 'bert/embeddings/LayerNorm/beta', + 'bert/embeddings/LayerNorm/gamma', + ], + 'Embedding-Mapping': [ + 'bert/encoder/embedding_hidden_mapping_in/kernel', + 'bert/encoder/embedding_hidden_mapping_in/bias', + ], + 'Pooler-Dense': [ + 'bert/pooler/dense/kernel', + 'bert/pooler/dense/bias', + ], + 'NSP-Proba': [ + 'cls/seq_relationship/output_weights', + 'cls/seq_relationship/output_bias', + ], + 'MLM-Dense': [ + 'cls/predictions/transform/dense/kernel', + 'cls/predictions/transform/dense/bias', + ], + 'MLM-Norm': [ + 'cls/predictions/transform/LayerNorm/beta', + 'cls/predictions/transform/LayerNorm/gamma', + ], + 'MLM-Bias': ['cls/predictions/output_bias'], + } + + for i in range(self.num_hidden_layers): + prefix = 'bert/encoder/layer_%d/' % i + mapping.update({ + 'Transformer-%d-MultiHeadSelfAttention' % i: [ + prefix + 'attention/self/query/kernel', + prefix + 'attention/self/query/bias', + prefix + 'attention/self/key/kernel', + prefix + 'attention/self/key/bias', + prefix + 'attention/self/value/kernel', + prefix + 'attention/self/value/bias', + prefix + 'attention/output/dense/kernel', + prefix + 'attention/output/dense/bias', + ], + 'Transformer-%d-MultiHeadSelfAttention-Norm' % i: [ + prefix + 'attention/output/LayerNorm/beta', + prefix + 'attention/output/LayerNorm/gamma', + ], + 'Transformer-%d-FeedForward' % i: [ + prefix + 'intermediate/dense/kernel', + prefix + 'intermediate/dense/bias', + prefix + 'output/dense/kernel', + prefix + 'output/dense/bias', + ], + 'Transformer-%d-FeedForward-Norm' % i: [ + prefix + 'output/LayerNorm/beta', + prefix + 'output/LayerNorm/gamma', + ], + }) + + return mapping + + +class ALBERT(BERT): + """构建ALBERT模型 + """ + def apply_main_layers(self, inputs, index): + """ALBERT的主体是基于Self-Attention的模块 + 顺序:Att --> Add --> LN --> FFN --> Add --> LN + """ + x = inputs + z = self.layer_norm_conds[0] + + attention_name = 'Transformer-MultiHeadSelfAttention' + feed_forward_name = 'Transformer-FeedForward' + attention_mask = 
self.compute_attention_bias(index) + + # Self Attention + xi, x, arguments = x, [x, x, x], {'a_bias': None} + if attention_mask is not None: + arguments['a_bias'] = True + x.append(attention_mask) + + x = self.apply( + inputs=x, + layer=MultiHeadAttention, + arguments=arguments, + heads=self.num_attention_heads, + head_size=self.attention_head_size, + out_dim=self.hidden_size, + key_size=self.attention_key_size, + attention_dropout=self.attention_dropout_rate, + kernel_initializer=self.initializer, + name=attention_name + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='%s-Dropout' % attention_name + ) + x = self.apply( + inputs=[xi, x], layer=Add, name='%s-Add' % attention_name + ) + x = self.apply( + inputs=self.simplify([x, z]), + layer=LayerNormalization, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='%s-Norm' % attention_name + ) + + # Feed Forward + xi = x + x = self.apply( + inputs=x, + layer=FeedForward, + units=self.intermediate_size, + activation=self.hidden_act, + kernel_initializer=self.initializer, + name=feed_forward_name + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='%s-Dropout' % feed_forward_name + ) + x = self.apply( + inputs=[xi, x], layer=Add, name='%s-Add' % feed_forward_name + ) + x = self.apply( + inputs=self.simplify([x, z]), + layer=LayerNormalization, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='%s-Norm' % feed_forward_name + ) + + return x + + def variable_mapping(self): + """映射到官方ALBERT权重格式 + """ + mapping = super(ALBERT, self).variable_mapping() + + prefix = 'bert/encoder/transformer/group_0/inner_group_0/' + mapping.update({ + 'Transformer-MultiHeadSelfAttention': [ + prefix + 'attention_1/self/query/kernel', + prefix + 'attention_1/self/query/bias', + prefix + 'attention_1/self/key/kernel', + prefix + 'attention_1/self/key/bias', + prefix + 'attention_1/self/value/kernel', + prefix + 'attention_1/self/value/bias', + prefix + 'attention_1/output/dense/kernel', + prefix + 'attention_1/output/dense/bias', + ], + 'Transformer-MultiHeadSelfAttention-Norm': [ + prefix + 'LayerNorm/beta', + prefix + 'LayerNorm/gamma', + ], + 'Transformer-FeedForward': [ + prefix + 'ffn_1/intermediate/dense/kernel', + prefix + 'ffn_1/intermediate/dense/bias', + prefix + 'ffn_1/intermediate/output/dense/kernel', + prefix + 'ffn_1/intermediate/output/dense/bias', + ], + 'Transformer-FeedForward-Norm': [ + prefix + 'LayerNorm_1/beta', + prefix + 'LayerNorm_1/gamma', + ], + }) + + return mapping + + +class ALBERT_Unshared(BERT): + """解开ALBERT共享约束,当成BERT用 + """ + def variable_mapping(self): + """映射到官方ALBERT权重格式 + """ + mapping = super(ALBERT_Unshared, self).variable_mapping() + + prefix = 'bert/encoder/transformer/group_0/inner_group_0/' + for i in range(self.num_hidden_layers): + mapping.update({ + 'Transformer-%d-MultiHeadSelfAttention' % i: [ + prefix + 'attention_1/self/query/kernel', + prefix + 'attention_1/self/query/bias', + prefix + 'attention_1/self/key/kernel', + prefix + 'attention_1/self/key/bias', + prefix + 'attention_1/self/value/kernel', + prefix + 'attention_1/self/value/bias', + prefix + 'attention_1/output/dense/kernel', + prefix + 'attention_1/output/dense/bias', + ], + 'Transformer-%d-MultiHeadSelfAttention-Norm' % i: [ + prefix + 'LayerNorm/beta', + prefix + 
'LayerNorm/gamma', + ], + 'Transformer-%d-FeedForward' % i: [ + prefix + 'ffn_1/intermediate/dense/kernel', + prefix + 'ffn_1/intermediate/dense/bias', + prefix + 'ffn_1/intermediate/output/dense/kernel', + prefix + 'ffn_1/intermediate/output/dense/bias', + ], + 'Transformer-%d-FeedForward-Norm' % i: [ + prefix + 'LayerNorm_1/beta', + prefix + 'LayerNorm_1/gamma', + ], + }) + + return mapping + + +class NEZHA(BERT): + """华为推出的NAZHA模型 + 链接:https://arxiv.org/abs/1909.00204 + """ + def apply_embeddings(self, inputs): + """NEZHA的embedding是token、segment两者embedding之和 + """ + inputs = inputs[:] + x = inputs.pop(0) + if self.segment_vocab_size > 0: + s = inputs.pop(0) + z = self.layer_norm_conds[0] + + x = self.apply( + inputs=x, + layer=Embedding, + input_dim=self.vocab_size, + output_dim=self.embedding_size, + embeddings_initializer=self.initializer, + mask_zero=True, + name='Embedding-Token' + ) + if self.segment_vocab_size > 0: + if self.shared_segment_embeddings: + name = 'Embedding-Token' + else: + name = 'Embedding-Segment' + s = self.apply( + inputs=s, + layer=Embedding, + input_dim=self.segment_vocab_size, + output_dim=self.embedding_size, + embeddings_initializer=self.initializer, + name=name + ) + x = self.apply( + inputs=[x, s], layer=Add, name='Embedding-Token-Segment' + ) + x = self.apply( + inputs=self.simplify([x, z]), + layer=LayerNormalization, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='Embedding-Norm' + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='Embedding-Dropout' + ) + if self.embedding_size != self.hidden_size: + x = self.apply( + inputs=x, + layer=Dense, + units=self.hidden_size, + kernel_initializer=self.initializer, + name='Embedding-Mapping' + ) + + return x + + def apply_main_layers(self, inputs, index): + """NEZHA的主体是基于Self-Attention的模块 + 顺序:Att --> Add --> LN --> FFN --> Add --> LN + """ + x = inputs + z = self.layer_norm_conds[0] + + attention_name = 'Transformer-%d-MultiHeadSelfAttention' % index + feed_forward_name = 'Transformer-%d-FeedForward' % index + attention_mask = self.compute_attention_bias(index) + position_bias = self.compute_position_bias(x) + + # Self Attention + xi, x = x, [x, x, x, position_bias] + arguments = {'a_bias': None, 'p_bias': 'typical_relative'} + if attention_mask is not None: + arguments['a_bias'] = True + x.insert(3, attention_mask) + + x = self.apply( + inputs=x, + layer=MultiHeadAttention, + arguments=arguments, + heads=self.num_attention_heads, + head_size=self.attention_head_size, + out_dim=self.hidden_size, + key_size=self.attention_key_size, + attention_dropout=self.attention_dropout_rate, + kernel_initializer=self.initializer, + name=attention_name + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='%s-Dropout' % attention_name + ) + x = self.apply( + inputs=[xi, x], layer=Add, name='%s-Add' % attention_name + ) + x = self.apply( + inputs=self.simplify([x, z]), + layer=LayerNormalization, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='%s-Norm' % attention_name + ) + + # Feed Forward + xi = x + x = self.apply( + inputs=x, + layer=FeedForward, + units=self.intermediate_size, + activation=self.hidden_act, + kernel_initializer=self.initializer, + name=feed_forward_name + ) + x = self.apply( + inputs=x, + layer=Dropout, + 
rate=self.dropout_rate, + name='%s-Dropout' % feed_forward_name + ) + x = self.apply( + inputs=[xi, x], layer=Add, name='%s-Add' % feed_forward_name + ) + x = self.apply( + inputs=self.simplify([x, z]), + layer=LayerNormalization, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='%s-Norm' % feed_forward_name + ) + + return x + + def compute_position_bias(self, inputs=None): + """经典相对位置编码 + """ + if self.position_bias is None: + + x = inputs + self.position_bias = self.apply( + inputs=[x, x], + layer=RelativePositionEmbedding, + input_dim=2 * 64 + 1, + output_dim=self.attention_key_size, + embeddings_initializer='Sinusoidal', + name='Embedding-Relative-Position', + trainable=False + ) + + return self.position_bias + + +class RoFormer(NEZHA): + """旋转式位置编码的BERT模型 + 链接:https://kexue.fm/archives/8265 + """ + def apply_main_layers(self, inputs, index): + """RoFormer的主体是基于Self-Attention的模块 + 顺序:Att --> Add --> LN --> FFN --> Add --> LN + """ + x = inputs + z = self.layer_norm_conds[0] + + attention_name = 'Transformer-%d-MultiHeadSelfAttention' % index + feed_forward_name = 'Transformer-%d-FeedForward' % index + attention_mask = self.compute_attention_bias(index) + position_bias = self.compute_position_bias(x) + + # Self Attention + xi, x = x, [x, x, x, position_bias] + arguments = {'a_bias': None, 'p_bias': 'rotary'} + if attention_mask is not None: + arguments['a_bias'] = True + x.insert(3, attention_mask) + + x = self.apply( + inputs=x, + layer=MultiHeadAttention, + arguments=arguments, + heads=self.num_attention_heads, + head_size=self.attention_head_size, + out_dim=self.hidden_size, + key_size=self.attention_key_size, + attention_dropout=self.attention_dropout_rate, + kernel_initializer=self.initializer, + name=attention_name + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='%s-Dropout' % attention_name + ) + x = self.apply( + inputs=[xi, x], layer=Add, name='%s-Add' % attention_name + ) + x = self.apply( + inputs=self.simplify([x, z]), + layer=LayerNormalization, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='%s-Norm' % attention_name + ) + + # Feed Forward + xi = x + x = self.apply( + inputs=x, + layer=FeedForward, + units=self.intermediate_size, + activation=self.hidden_act, + kernel_initializer=self.initializer, + name=feed_forward_name + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='%s-Dropout' % feed_forward_name + ) + x = self.apply( + inputs=[xi, x], layer=Add, name='%s-Add' % feed_forward_name + ) + x = self.apply( + inputs=self.simplify([x, z]), + layer=LayerNormalization, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='%s-Norm' % feed_forward_name + ) + + return x + + def compute_position_bias(self, inputs=None): + """Sinusoidal位置编码(直接返回) + """ + if self.position_bias is None: + + if self.custom_position_ids: + x = [inputs, self.inputs[2]] + else: + x = inputs + + self.position_bias = self.apply( + inputs=x, + layer=SinusoidalPositionEmbedding, + output_dim=self.attention_key_size, + merge_mode='zero', + custom_position_ids=self.custom_position_ids, + name='Embedding-Rotary-Position' + ) + + return self.position_bias + + +class RoFormerV2(RoFormer): + """RoFormerV2 + 
改动:去掉bias,简化Norm,优化初始化等。 + """ + def initializer(self, shape, dtype=None, order=2, gain=1.0): + """使用截断正态分布初始化 + """ + if shape[0] > 10000 or shape[0] < 10: + hidden_size = shape[1] + else: + hidden_size = shape[0] + gain *= self.num_hidden_layers**(-1. / order) + stddev = 1.13684723 / hidden_size**0.5 * gain + return K.truncated_normal(shape, stddev=stddev) + + def apply_embeddings(self, inputs): + """RoFormerV2的embedding是token、segment两者embedding之和 + """ + inputs = inputs[:] + x = inputs.pop(0) + if self.segment_vocab_size > 0: + s = inputs.pop(0) + + x = self.apply( + inputs=x, + layer=Embedding, + input_dim=self.vocab_size, + output_dim=self.embedding_size, + embeddings_initializer=self.initializer, + mask_zero=True, + name='Embedding-Token' + ) + if self.segment_vocab_size > 0: + if self.shared_segment_embeddings: + name = 'Embedding-Token' + else: + name = 'Embedding-Segment' + s = self.apply( + inputs=s, + layer=Embedding, + input_dim=self.segment_vocab_size, + output_dim=self.embedding_size, + embeddings_initializer=self.initializer, + name=name + ) + x = self.apply( + inputs=[x, s], layer=Add, name='Embedding-Token-Segment' + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='Embedding-Dropout' + ) + x = self.apply( + inputs=x, + layer=LayerNormalization, + zero_mean=False, + scale=False, + offset=False, + name='Embedding-Norm' + ) + if self.embedding_size != self.hidden_size: + x = self.apply( + inputs=x, + layer=Dense, + units=self.hidden_size, + use_bias=False, + kernel_initializer=self.initializer, + name='Embedding-Mapping' + ) + + return x + + def apply_main_layers(self, inputs, index): + """RoFormerV2的主体是基于Self-Attention的模块 + 顺序:Att --> Add --> LN --> FFN --> Add --> LN + """ + x = inputs + + attention_name = 'Transformer-%d-MultiHeadSelfAttention' % index + feed_forward_name = 'Transformer-%d-FeedForward' % index + attention_mask = self.compute_attention_bias(index) + position_bias = self.compute_position_bias(x) + + # Self Attention + xi = x + x = [x, x, x, position_bias] + arguments = {'a_bias': None, 'p_bias': 'rotary'} + if attention_mask is not None: + arguments['a_bias'] = True + x.insert(3, attention_mask) + x = self.apply( + inputs=x, + layer=MultiHeadAttention, + arguments=arguments, + heads=self.num_attention_heads, + head_size=self.attention_head_size, + out_dim=self.hidden_size, + key_size=self.attention_key_size, + use_bias=False, + attention_dropout=self.attention_dropout_rate, + kernel_initializer=self.initializer, + name=attention_name + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='%s-Dropout' % attention_name + ) + x = self.apply( + inputs=[xi, x], layer=Add, name='%s-Add' % attention_name + ) + x = self.apply( + inputs=x, + layer=LayerNormalization, + zero_mean=False, + scale=False, + offset=False, + name='%s-Norm' % attention_name + ) + + # Feed Forward + xi = x + x = self.apply( + inputs=x, + layer=FeedForward, + units=self.intermediate_size, + activation=self.hidden_act, + use_bias=False, + kernel_initializer=self.initializer, + name=feed_forward_name + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='%s-Dropout' % feed_forward_name + ) + x = self.apply( + inputs=[xi, x], layer=Add, name='%s-Add' % feed_forward_name + ) + x = self.apply( + inputs=x, + layer=LayerNormalization, + zero_mean=False, + scale=False, + offset=False, + name='%s-Norm' % feed_forward_name + ) + + return x + + def apply_final_layers(self, inputs): + """剩余部分 + """ + x = inputs + 
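# Worked example (illustrative): the depth-scaled initializer above uses
# stddev = 1.13684723 / sqrt(hidden_size) * num_hidden_layers ** (-1 / order);
# e.g. for hidden_size = 768, num_hidden_layers = 12 and order = 2 this gives
# roughly 0.041 * 0.289 ~= 0.012, so deeper stacks start from proportionally
# smaller weights.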
+ if self.with_mlm: + # 预测token概率部分 + if self.embedding_size != self.hidden_size: + x = self.apply( + inputs=x, + layer=Dense, + units=self.embedding_size, + use_bias=False, + kernel_initializer=self.initializer, + name='Output-Mapping' + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='Output-MLM-Dropout' + ) + mlm_activation = 'softmax' if self.with_mlm is True else self.with_mlm + x = self.apply( + inputs=x, + layer=Embedding, + arguments={'mode': 'dense'}, + name='Embedding-Token' + ) + x = self.apply( + inputs=x, + layer=Activation, + activation=mlm_activation, + name='Output-MLM-Activation' + ) + + return x + + def variable_mapping(self): + """删掉部分权重映射 + """ + mapping = super(RoFormerV2, self).variable_mapping() + + for k, v in mapping.items(): + v = [ + i for i in v + if not string_matching(i, ['beta', 'gamma', 'bias']) + ] + mapping[k] = v + + return mapping + + +class ELECTRA(BERT): + """Google推出的ELECTRA模型 + 链接:https://arxiv.org/abs/2003.10555 + """ + @insert_arguments(with_discriminator=False) + @delete_arguments('with_pool', 'with_mlm') + def __init__( + self, + max_position, # 序列最大长度 + **kwargs # 其余参数 + ): + super(ELECTRA, self).__init__(max_position, **kwargs) + + def apply_final_layers(self, inputs): + x = inputs + + if self.with_discriminator: + if self.with_discriminator is True: + final_activation = 'sigmoid' + else: + final_activation = self.with_discriminator + x = self.apply( + inputs=x, + layer=Dense, + units=self.hidden_size, + activation=self.hidden_act, + kernel_initializer=self.initializer, + name='Discriminator-Dense' + ) + x = self.apply( + inputs=x, + layer=Dense, + units=1, + activation=final_activation, + kernel_initializer=self.initializer, + name='Discriminator-Prediction' + ) + + return x + + def load_variable(self, checkpoint, name): + """加载单个变量的函数 + """ + variable = super(ELECTRA, self).load_variable(checkpoint, name) + if name == 'electra/embeddings/word_embeddings': + return self.load_embeddings(variable) + else: + return variable + + def variable_mapping(self): + mapping = super(ELECTRA, self).variable_mapping() + mapping['Embedding-Mapping'] = [ + 'electra/embeddings_project/kernel', + 'electra/embeddings_project/bias', + ] + mapping = { + k: [i.replace('bert/', 'electra/') for i in v] + for k, v in mapping.items() + } + mapping['Discriminator-Dense'] = [ + 'discriminator_predictions/dense/kernel', + 'discriminator_predictions/dense/bias', + ] + mapping['Discriminator-Prediction'] = [ + 'discriminator_predictions/dense_1/kernel', + 'discriminator_predictions/dense_1/bias', + ] + return mapping + + +class GPT(LM_Mask, BERT): + """构建GPT模型 + 链接:https://github.com/openai/finetune-transformer-lm + """ + @insert_arguments(final_activation='softmax') + @delete_arguments('with_pool', 'with_mlm') + def __init__(self, **kwargs): + super(GPT, self).__init__(**kwargs) + + def apply_embeddings(self, inputs): + """GPT的embedding是token、position、segment三者embedding之和 + 跟BERT的主要区别是三者相加之后没有加LayerNormalization层。 + """ + inputs = inputs[:] + x = inputs.pop(0) + if self.segment_vocab_size > 0: + s = inputs.pop(0) + if self.custom_position_ids: + p = inputs.pop(0) + else: + p = None + + x = self.apply( + inputs=x, + layer=Embedding, + input_dim=self.vocab_size, + output_dim=self.embedding_size, + embeddings_initializer=self.initializer, + mask_zero=True, + name='Embedding-Token' + ) + if self.segment_vocab_size > 0: + if self.shared_segment_embeddings: + name = 'Embedding-Token' + else: + name = 'Embedding-Segment' + s = self.apply( + 
inputs=s, + layer=Embedding, + input_dim=self.segment_vocab_size, + output_dim=self.embedding_size, + embeddings_initializer=self.initializer, + name=name + ) + x = self.apply( + inputs=[x, s], layer=Add, name='Embedding-Token-Segment' + ) + x = self.apply( + inputs=self.simplify([x, p]), + layer=PositionEmbedding, + input_dim=self.max_position, + output_dim=self.embedding_size, + merge_mode='add', + hierarchical=self.hierarchical_position, + embeddings_initializer=self.initializer, + custom_position_ids=self.custom_position_ids, + name='Embedding-Position' + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='Embedding-Dropout' + ) + if self.embedding_size != self.hidden_size: + x = self.apply( + inputs=x, + layer=Dense, + units=self.hidden_size, + kernel_initializer=self.initializer, + name='Embedding-Mapping' + ) + + return x + + def apply_final_layers(self, inputs): + """剩余部分 + """ + x = inputs + + # Language Model部分 + x = self.apply( + inputs=x, + layer=Embedding, + arguments={'mode': 'dense'}, + name='Embedding-Token' + ) + x = self.apply( + inputs=x, + layer=Activation, + activation=self.final_activation, + name='LM-Activation' + ) + + return x + + def load_variable(self, checkpoint, name): + """加载单个变量的函数 + """ + variable = super(GPT, self).load_variable(checkpoint, name) + if name == 'gpt/embeddings/word_embeddings': + return self.load_embeddings(variable) + else: + return variable + + def variable_mapping(self): + """映射到TF版GPT权重格式 + """ + mapping = super(GPT, self).variable_mapping() + mapping = { + k: [ + i.replace('bert/', 'gpt/').replace('encoder', 'transformer') + for i in v + ] + for k, v in mapping.items() + } + return mapping + + +class GPT2(GPT): + """构建GPT2模型 + 链接: https://github.com/openai/gpt-2 + """ + def get_inputs(self): + """GPT2的输入是token_ids + """ + x_in = self.apply( + layer=Input, shape=(self.sequence_length,), name='Input-Token' + ) + return x_in + + def apply_embeddings(self, inputs): + """GPT2的embedding是token、position两者embedding之和 + """ + x = inputs + + x = self.apply( + inputs=x, + layer=Embedding, + input_dim=self.vocab_size, + output_dim=self.embedding_size, + embeddings_initializer=self.initializer, + mask_zero=True, + name='Embedding-Token' + ) + x = self.apply( + inputs=x, + layer=PositionEmbedding, + input_dim=self.max_position, + output_dim=self.embedding_size, + merge_mode='add', + hierarchical=self.hierarchical_position, + embeddings_initializer=self.initializer, + name='Embedding-Position' + ) + if self.embedding_size != self.hidden_size: + x = self.apply( + inputs=x, + layer=Dense, + units=self.hidden_size, + kernel_initializer=self.initializer, + name='Embedding-Mapping' + ) + + return x + + def apply_main_layers(self, inputs, index): + """GPT2的主体是基于Self-Attention的模块 + 顺序:LN --> Att --> Add --> LN --> FFN --> Add + """ + x = inputs + z = self.layer_norm_conds[0] + + attention_name = 'Transformer-%d-MultiHeadSelfAttention' % index + feed_forward_name = 'Transformer-%d-FeedForward' % index + attention_mask = self.compute_attention_bias(index) + + # Self Attention + xi = x + x = self.apply( + inputs=self.simplify([x, z]), + layer=LayerNormalization, + epsilon=1e-5, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='%s-Norm' % attention_name + ) + x = self.apply( + inputs=[x, x, x, attention_mask], + layer=MultiHeadAttention, + arguments={'a_bias': True}, + heads=self.num_attention_heads, + 
head_size=self.attention_head_size, + out_dim=self.hidden_size, + key_size=self.attention_key_size, + attention_dropout=self.attention_dropout_rate, + kernel_initializer=self.initializer, + name=attention_name + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='%s-Dropout' % attention_name + ) + x = self.apply( + inputs=[xi, x], layer=Add, name='%s-Add' % attention_name + ) + + # Feed Forward + xi = x + x = self.apply( + inputs=self.simplify([x, z]), + layer=LayerNormalization, + epsilon=1e-5, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='%s-Norm' % feed_forward_name + ) + x = self.apply( + inputs=x, + layer=FeedForward, + units=self.intermediate_size, + activation=self.hidden_act, + kernel_initializer=self.initializer, + name=feed_forward_name + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='%s-Dropout' % feed_forward_name + ) + x = self.apply( + inputs=[xi, x], layer=Add, name='%s-Add' % feed_forward_name + ) + return x + + def apply_final_layers(self, inputs): + """剩余部分 + """ + x = inputs + z = self.layer_norm_conds[0] + + x = self.apply( + inputs=self.simplify([x, z]), + layer=LayerNormalization, + epsilon=1e-5, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='Output-Norm' + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='Output-Dropout' + ) + x = super(GPT2, self).apply_final_layers(x) + + return x + + def variable_mapping(self): + """映射到TF版GPT2权重格式 + """ + mapping = super(GPT2, self).variable_mapping() + mapping = { + k: [i.replace('output/LayerNorm', 'input/LayerNorm') for i in v] + for k, v in mapping.items() + } + mapping['Output-Norm'] = [ + 'gpt/output/LayerNorm/beta', + 'gpt/output/LayerNorm/gamma', + ] + + return mapping + + +class GPT2_ML(GPT): + """构建GPT2_ML模型 + 链接: https://github.com/imcaspar/gpt2-ml + 注意:GPT2_ML虽然号称GPT2,但是它的结构其实更接近GPT,它自称GPT2的 + 原因大概是因为它开源的版本参数量达到了GPT2的15亿参数。 + """ + def get_inputs(self): + """GPT2_ML的输入是token_ids + """ + x_in = self.apply( + layer=Input, shape=(self.sequence_length,), name='Input-Token' + ) + return x_in + + def apply_embeddings(self, inputs): + """GPT2_ML的embedding是token、position两者embedding之和 + """ + x = inputs + z = self.layer_norm_conds[0] + + x = self.apply( + inputs=x, + layer=Embedding, + input_dim=self.vocab_size, + output_dim=self.embedding_size, + embeddings_initializer=self.initializer, + mask_zero=True, + name='Embedding-Token' + ) + x = self.apply( + inputs=x, + layer=PositionEmbedding, + input_dim=self.max_position, + output_dim=self.embedding_size, + merge_mode='add', + hierarchical=self.hierarchical_position, + embeddings_initializer=self.initializer, + name='Embedding-Position' + ) + x = self.apply( + inputs=self.simplify([x, z]), + layer=LayerNormalization, + epsilon=1e-5, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='Embedding-Norm' + ) + if self.embedding_size != self.hidden_size: + x = self.apply( + inputs=x, + layer=Dense, + units=self.hidden_size, + kernel_initializer=self.initializer, + name='Embedding-Mapping' + ) + + return x + + def apply_main_layers(self, inputs, index): + """GPT2_ML的主体是基于Self-Attention的模块 + 顺序:Att --> LN --> FFN --> Add --> LN + """ + x = inputs 
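# Note (illustrative comparison of the block orderings used by the GPT
# variants in this file):
#   GPT     : Att -> Add -> LN -> FFN -> Add -> LN  (post-norm, as in BERT)
#   GPT2    : LN -> Att -> Add -> LN -> FFN -> Add  (pre-norm), plus a final
#             Output-Norm before the tied LM head
#   GPT2_ML : Att -> Add -> LN -> FFN -> Add -> LN, with the two layer norms
#             placed around the feed-forward block (Norm-0 / Norm-1 below)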
+ z = self.layer_norm_conds[0] + + attention_name = 'Transformer-%d-MultiHeadSelfAttention' % index + feed_forward_name = 'Transformer-%d-FeedForward' % index + attention_mask = self.compute_attention_bias(index) + + # Self Attention + xi, x, arguments = x, [x, x, x, attention_mask], {'a_bias': True} + + x = self.apply( + inputs=x, + layer=MultiHeadAttention, + arguments=arguments, + heads=self.num_attention_heads, + head_size=self.attention_head_size, + out_dim=self.hidden_size, + key_size=self.attention_key_size, + attention_dropout=self.attention_dropout_rate, + kernel_initializer=self.initializer, + name=attention_name + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='%s-Dropout' % attention_name + ) + x = self.apply( + inputs=[xi, x], layer=Add, name='%s-Add' % attention_name + ) + + # Feed Forward + xi = x + x = self.apply( + inputs=self.simplify([x, z]), + layer=LayerNormalization, + epsilon=1e-5, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='%s-Norm-0' % feed_forward_name + ) + x = self.apply( + inputs=x, + layer=FeedForward, + units=self.intermediate_size, + activation=self.hidden_act, + kernel_initializer=self.initializer, + name=feed_forward_name + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='%s-Dropout' % feed_forward_name + ) + x = self.apply( + inputs=[xi, x], layer=Add, name='%s-Add' % feed_forward_name + ) + x = self.apply( + inputs=self.simplify([x, z]), + layer=LayerNormalization, + epsilon=1e-5, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='%s-Norm-1' % feed_forward_name + ) + + return x + + def load_variable(self, checkpoint, name): + """加载单个变量的函数 + """ + variable = super(GPT2_ML, self).load_variable(checkpoint, name) + if name == 'newslm/embeddings/word_embed': + return self.load_embeddings(variable) + else: + return variable + + def variable_mapping(self): + """映射到官方GPT2_ML权重格式 + """ + mapping = { + 'Embedding-Token': ['newslm/embeddings/word_embed'], + 'Embedding-Position': ['newslm/embeddings/pos_embed'], + 'Embedding-Norm': [ + 'newslm/embeddings/LayerNorm_embed_norm/beta', + 'newslm/embeddings/LayerNorm_embed_norm/gamma', + ], + } + + for i in range(self.num_hidden_layers): + prefix = 'newslm/layer%02d/' % i + mapping.update({ + 'Transformer-%d-MultiHeadSelfAttention' % i: [ + prefix + 'query_layer/kernel', + prefix + 'query_layer/bias', + prefix + 'key_layer/kernel', + prefix + 'key_layer/bias', + prefix + 'value_layer/kernel', + prefix + 'value_layer/bias', + prefix + 'context_projection_layer/kernel', + prefix + 'context_projection_layer/bias', + ], + 'Transformer-%d-FeedForward-Norm-0' % i: [ + prefix + 'LayerNorm_mlp_ln0/beta', + prefix + 'LayerNorm_mlp_ln0/gamma', + ], + 'Transformer-%d-FeedForward' % i: [ + prefix + 'intermediate/kernel', + prefix + 'intermediate/bias', + prefix + 'output/kernel', + prefix + 'output/bias', + ], + 'Transformer-%d-FeedForward-Norm-1' % i: [ + prefix + 'LayerNorm_mlp_ln1/beta', + prefix + 'LayerNorm_mlp_ln1/gamma', + ], + }) + + return mapping + + +class T5_Base(Transformer): + """Google的T5模型(基类) + 注意T5有两个版本,一开始放出来的版本称为t5.1.0,而后来放出了一个升级 + 版本称为t5.1.1,两者结构略有不同,包括后来放出来的多国语言版T5也采用 + 了t5.1.1的结构。 + t5.1.0: https://github.com/google-research/text-to-text-transfer-transformer + t5.1.1: 
https://github.com/google-research/text-to-text-transfer-transformer/blob/master/released_checkpoints.md#t511 + multilingual-t5: https://github.com/google-research/multilingual-t5 + """ + @insert_arguments(version='t5.1.0') + def __init__(self, **kwargs): + super(T5_Base, self).__init__(**kwargs) + + def load_variable(self, checkpoint, name): + """加载单个变量的函数 + """ + variable = super(T5_Base, self).load_variable(checkpoint, name) + if name == 'shared/embedding': + return self.load_embeddings(variable) + elif name == 'decoder/logits/kernel': + return self.load_embeddings(variable.T).T + elif 'relative_attention_bias' in name: + return variable.T + else: + return variable + + def create_variable(self, name, value, dtype=None): + """在tensorflow中创建一个变量 + """ + if 'relative_attention_bias' in name: + value = value.T + return super(T5_Base, self).create_variable(name, value, dtype) + + def variable_mapping(self): + """映射到官方T5权重格式 + """ + mapping = { + 'Embedding-Token': ['shared/embedding'], + 'Encoder-Embedding-Relative-Position': [ + 'encoder/block_000/layer_000/SelfAttention/relative_attention_bias' + ], + 'Encoder-Output-Norm': ['encoder/final_layer_norm/scale'], + 'Decoder-Embedding-Relative-Position': [ + 'decoder/block_000/layer_000/SelfAttention/relative_attention_bias', + ], + 'Decoder-Output-Norm': ['decoder/final_layer_norm/scale'], + } + + for i in range(self.num_hidden_layers): + # Encoder主体 + prefix = 'encoder/block_%03d/' % i + mapping.update({ + 'Encoder-Transformer-%d-MultiHeadSelfAttention' % i: [ + prefix + 'layer_000/SelfAttention/q', + prefix + 'layer_000/SelfAttention/k', + prefix + 'layer_000/SelfAttention/v', + prefix + 'layer_000/SelfAttention/o', + ], + 'Encoder-Transformer-%d-MultiHeadSelfAttention-Norm' % i: [ + prefix + 'layer_000/layer_norm/scale', + ], + 'Encoder-Transformer-%d-FeedForward' % i: [ + prefix + 'layer_001/DenseReluDense/wi/kernel', + prefix + 'layer_001/DenseReluDense/wo/kernel', + ], + 'Encoder-Transformer-%d-FeedForward-Norm' % i: [ + prefix + 'layer_001/layer_norm/scale', + ], + }) + # Decoder主体 + prefix = 'decoder/block_%03d/' % i + mapping.update({ + 'Decoder-Transformer-%d-MultiHeadSelfAttention' % i: [ + prefix + 'layer_000/SelfAttention/q', + prefix + 'layer_000/SelfAttention/k', + prefix + 'layer_000/SelfAttention/v', + prefix + 'layer_000/SelfAttention/o', + ], + 'Decoder-Transformer-%d-MultiHeadSelfAttention-Norm' % i: [ + prefix + 'layer_000/layer_norm/scale', + ], + 'Decoder-Transformer-%d-MultiHeadCrossAttention' % i: [ + prefix + 'layer_001/EncDecAttention/q', + prefix + 'layer_001/EncDecAttention/k', + prefix + 'layer_001/EncDecAttention/v', + prefix + 'layer_001/EncDecAttention/o', + ], + 'Decoder-Transformer-%d-MultiHeadCrossAttention-Norm' % i: [ + prefix + 'layer_001/layer_norm/scale', + ], + 'Decoder-Transformer-%d-FeedForward' % i: [ + prefix + 'layer_002/DenseReluDense/wi/kernel', + prefix + 'layer_002/DenseReluDense/wo/kernel', + ], + 'Decoder-Transformer-%d-FeedForward-Norm' % i: [ + prefix + 'layer_002/layer_norm/scale', + ], + }) + + if self.version.endswith('t5.1.1'): + mapping['Decoder-Output-LM'] = ['decoder/logits/kernel'] + for i in range(self.num_hidden_layers): + for layer in [ + 'Encoder-Transformer-%d-FeedForward' % i, + 'Decoder-Transformer-%d-FeedForward' % i + ]: + mapping[layer] = [ + mapping[layer][0][:-7] + '_0' + mapping[layer][0][-7:], + mapping[layer][0][:-7] + '_1' + mapping[layer][0][-7:], + mapping[layer][1] + ] + if self.version == 'mt5.1.1': + mapping['Encoder-Output-Norm'] = ['encoder/rms_norm/scale'] + 
mapping['Decoder-Output-Norm'] = ['decoder/rms_norm/scale'] + mapping = { + k: [i.replace('layer_norm', 'rms_norm') for i in v] + for k, v in mapping.items() + } + + return mapping + + +class T5_Encoder(T5_Base): + """Google的T5模型(Encoder) + """ + def get_inputs(self): + """T5的Encoder的输入只有token_ids + """ + x_in = self.apply( + layer=Input, + shape=(self.sequence_length,), + name='Encoder-Input-Token' + ) + return x_in + + def apply_embeddings(self, inputs): + """T5的embedding只有token embedding, + 并把relative position embedding准备好,待attention使用。 + """ + x = inputs + + x = self.apply( + inputs=x, + layer=Embedding, + input_dim=self.vocab_size, + output_dim=self.embedding_size, + embeddings_initializer=self.initializer, + mask_zero=True, + name='Embedding-Token' + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='Encoder-Embedding-Dropout' + ) + if self.embedding_size != self.hidden_size: + x = self.apply( + inputs=x, + layer=Dense, + units=self.hidden_size, + kernel_initializer=self.initializer, + name='Encoder-Embedding-Mapping' + ) + + return x + + def apply_main_layers(self, inputs, index): + """T5的Encoder的主体是基于Self-Attention的模块 + 顺序:LN --> Att --> Add --> LN --> FFN --> Add + """ + x = inputs + z = self.layer_norm_conds[0] + + attention_name = 'Encoder-Transformer-%d-MultiHeadSelfAttention' % index + feed_forward_name = 'Encoder-Transformer-%d-FeedForward' % index + attention_mask = self.compute_attention_bias(index) + position_bias = self.compute_position_bias(x) + + # Self Attention + xi = x + x = self.apply( + inputs=self.simplify([x, z]), + layer=LayerNormalization, + zero_mean=False, + offset=False, + epsilon=1e-6, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='%s-Norm' % attention_name + ) + x = self.apply( + inputs=[x, x, x, position_bias], + layer=MultiHeadAttention, + arguments={'p_bias': 't5_relative'}, + heads=self.num_attention_heads, + head_size=self.attention_head_size, + out_dim=self.hidden_size, + key_size=self.attention_key_size, + use_bias=False, + attention_scale=False, + attention_dropout=self.attention_dropout_rate, + kernel_initializer=self.initializer, + name=attention_name + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='%s-Dropout' % attention_name + ) + x = self.apply( + inputs=[xi, x], layer=Add, name='%s-Add' % attention_name + ) + + # Feed Forward + xi = x + x = self.apply( + inputs=self.simplify([x, z]), + layer=LayerNormalization, + zero_mean=False, + offset=False, + epsilon=1e-6, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='%s-Norm' % feed_forward_name + ) + x = self.apply( + inputs=x, + layer=FeedForward, + units=self.intermediate_size, + activation=self.hidden_act, + use_bias=False, + kernel_initializer=self.initializer, + name=feed_forward_name + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='%s-Dropout' % feed_forward_name + ) + x = self.apply( + inputs=[xi, x], layer=Add, name='%s-Add' % feed_forward_name + ) + + return x + + def apply_final_layers(self, inputs): + """剩余部分 + """ + x = inputs + z = self.layer_norm_conds[0] + + x = self.apply( + inputs=self.simplify([x, z]), + layer=LayerNormalization, + zero_mean=False, + offset=False, + epsilon=1e-6, + conditional=(z is not None), + 
hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='Encoder-Output-Norm' + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='Encoder-Output-Dropout' + ) + + return x + + def compute_position_bias(self, inputs=None): + """T5相对位置编码 + """ + if self.position_bias is None: + + x = inputs + p = self.apply( + inputs=[x, x], + layer=RelativePositionEmbeddingT5, + input_dim=32, + output_dim=self.num_attention_heads, + bidirectional=True, + embeddings_initializer=self.initializer, + name='Encoder-Embedding-Relative-Position' + ) + self.position_bias = p + + return self.position_bias + + +class T5_Decoder(LM_Mask, T5_Base): + """Google的T5模型(Decoder) + """ + def __init__(self, with_lm=True, **kwargs): + super(T5_Decoder, self).__init__(**kwargs) + self.with_lm = with_lm + + def get_inputs(self): + """T5的Decoder的输入为context序列和token_ids + """ + c_in = self.apply( + layer=Input, + shape=(self.sequence_length, self.hidden_size), + name='Input-Context' + ) + x_in = self.apply( + layer=Input, + shape=(self.sequence_length,), + name='Decoder-Input-Token' + ) + return [c_in, x_in] + + def apply_embeddings(self, inputs): + """T5的embedding只有token embedding, + 并把relative position embedding准备好,待attention使用。 + """ + c, x = inputs + + x = self.apply( + inputs=x, + layer=Embedding, + input_dim=self.vocab_size, + output_dim=self.embedding_size, + embeddings_initializer=self.initializer, + mask_zero=True, + name='Embedding-Token' + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='Decoder-Embedding-Dropout' + ) + if self.embedding_size != self.hidden_size: + x = self.apply( + inputs=x, + layer=Dense, + units=self.hidden_size, + kernel_initializer=self.initializer, + name='Decoder-Embedding-Mapping' + ) + + return [c, x] + + def apply_main_layers(self, inputs, index): + """T5的Decoder主体是基于Self-Attention、Cross-Attention的模块 + 顺序:LN --> Att1 --> Add --> LN --> Att2 --> Add --> LN --> FFN --> Add + """ + c, x = inputs + z = self.layer_norm_conds[0] + + self_attention_name = 'Decoder-Transformer-%d-MultiHeadSelfAttention' % index + cross_attention_name = 'Decoder-Transformer-%d-MultiHeadCrossAttention' % index + feed_forward_name = 'Decoder-Transformer-%d-FeedForward' % index + attention_mask = self.compute_attention_bias(index) + position_bias = self.compute_position_bias([x, c]) + + # Self Attention + xi = x + x = self.apply( + inputs=self.simplify([x, z]), + layer=LayerNormalization, + zero_mean=False, + offset=False, + epsilon=1e-6, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='%s-Norm' % self_attention_name + ) + x = self.apply( + inputs=[x, x, x, attention_mask, position_bias[0]], + layer=MultiHeadAttention, + arguments={ + 'a_bias': True, + 'p_bias': 't5_relative' + }, + heads=self.num_attention_heads, + head_size=self.attention_head_size, + out_dim=self.hidden_size, + key_size=self.attention_key_size, + use_bias=False, + attention_scale=False, + attention_dropout=self.attention_dropout_rate, + kernel_initializer=self.initializer, + name=self_attention_name + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='%s-Dropout' % self_attention_name + ) + x = self.apply( + inputs=[xi, x], layer=Add, name='%s-Add' % self_attention_name + ) + + # Cross Attention + xi = x + x = self.apply( + inputs=self.simplify([x, z]), + 
layer=LayerNormalization, + zero_mean=False, + offset=False, + epsilon=1e-6, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='%s-Norm' % cross_attention_name + ) + x = self.apply( + inputs=[x, c, c, position_bias[1]], + layer=MultiHeadAttention, + arguments={ + 'a_bias': None, + 'p_bias': 't5_relative' + }, + heads=self.num_attention_heads, + head_size=self.attention_head_size, + out_dim=self.hidden_size, + key_size=self.attention_key_size, + use_bias=False, + attention_scale=False, + attention_dropout=self.attention_dropout_rate, + kernel_initializer=self.initializer, + name=cross_attention_name + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='%s-Dropout' % cross_attention_name + ) + x = self.apply( + inputs=[xi, x], layer=Add, name='%s-Add' % cross_attention_name + ) + + # Feed Forward + xi = x + x = self.apply( + inputs=self.simplify([x, z]), + layer=LayerNormalization, + zero_mean=False, + offset=False, + epsilon=1e-6, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='%s-Norm' % feed_forward_name + ) + x = self.apply( + inputs=x, + layer=FeedForward, + units=self.intermediate_size, + activation=self.hidden_act, + use_bias=False, + kernel_initializer=self.initializer, + name=feed_forward_name + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='%s-Dropout' % feed_forward_name + ) + x = self.apply( + inputs=[xi, x], layer=Add, name='%s-Add' % feed_forward_name + ) + + return [c, x] + + def apply_final_layers(self, inputs): + """剩余部分 + """ + c, x = inputs + z = self.layer_norm_conds[0] + + x = self.apply( + inputs=self.simplify([x, z]), + layer=LayerNormalization, + zero_mean=False, + offset=False, + epsilon=1e-6, + conditional=(z is not None), + hidden_units=self.layer_norm_conds[1], + hidden_activation=self.layer_norm_conds[2], + hidden_initializer=self.initializer, + name='Decoder-Output-Norm' + ) + x = self.apply( + inputs=x, + layer=Dropout, + rate=self.dropout_rate, + name='Decoder-Output-Dropout' + ) + x = self.apply( + inputs=x, + layer=ScaleOffset, + scale=self.hidden_size**(-0.5), + offset=False, + name='Decoder-Output-Scale' + ) + + if self.with_lm: + # 预测token概率部分 + if self.embedding_size != self.hidden_size: + x = self.apply( + inputs=x, + layer=Dense, + units=self.embedding_size, + kernel_initializer=self.initializer, + name='Decoder-Output-Mapping' + ) + lm_activation = 'softmax' if self.with_lm is True else self.with_lm + if self.version == 't5.1.0': + x = self.apply( + inputs=x, + layer=Embedding, + arguments={'mode': 'dense'}, + name='Embedding-Token' + ) + x = self.apply( + inputs=x, + layer=Activation, + activation=lm_activation, + name='Decoder-Output-LM-Activation' + ) + else: + x = self.apply( + inputs=x, + layer=Dense, + units=self.vocab_size, + activation=lm_activation, + use_bias=False, + kernel_initializer=self.initializer, + name='Decoder-Output-LM' + ) + + return x + + def compute_attention_bias(self, inputs=None): + """修改LM Mask的序列长度(从 self.inputs[0] 改为 self.inputs[1] ) + """ + old_inputs = self.inputs[:] + self.inputs = [old_inputs[1]] + mask = super(T5_Decoder, self).compute_attention_bias(inputs) + self.inputs = old_inputs + return mask + + def compute_position_bias(self, inputs=None): + """T5相对位置编码 + """ + if self.position_bias is None: + + x, c = inputs + p1 = 
self.apply( + inputs=[x, x], + layer=RelativePositionEmbeddingT5, + input_dim=32, + output_dim=self.num_attention_heads, + bidirectional=False, + embeddings_initializer=self.initializer, + name='Decoder-Embedding-Relative-Position' + ) + p2 = self.apply( + inputs=[x, c], + layer=RelativePositionEmbeddingT5, + input_dim=32, + output_dim=self.num_attention_heads, + bidirectional=False, + embeddings_initializer=self.initializer, + name='Decoder-Embedding-Relative-Position' + ) + self.position_bias = (p1, p2) + + return self.position_bias + + +class T5(T5_Base): + """Google的T5模型(Encoder-Decoder) + """ + def __init__(self, **kwargs): + super(T5, self).__init__(**kwargs) + kwargs['layers'] = self.layers + e_name, d_name = 'Encoder', 'Decoder' + if 'name' in kwargs: + e_name = '%s_%s' % (kwargs['name'], e_name) + d_name = '%s_%s' % (kwargs['name'], d_name) + del kwargs['name'] # 防止重复传参 + self._encoder = T5_Encoder(name=e_name, **kwargs) + self._decoder = T5_Decoder(name=d_name, **kwargs) + + def build(self, **kwargs): + """同时构建Encoder和Decoder + """ + self._encoder.build(**kwargs) + self._decoder.build(**kwargs) + self._decoder.position_bias = None # 下面call时将重新初始化 + self.encoder = self._encoder.model + self.decoder = self._decoder.model + self.inputs = self.encoder.inputs + self.decoder.inputs[1:] + self.outputs = self._decoder.call( + self.encoder.outputs + self.decoder.inputs[1:] + ) + self.model = Model(self.inputs, self.outputs) + + +def extend_with_language_model(BaseModel): + """添加下三角的Attention Mask(语言模型用) + """ + class LanguageModel(LM_Mask, BaseModel): + """带下三角Attention Mask的派生模型 + """ + def __init__(self, *args, **kwargs): + super(LanguageModel, self).__init__(*args, **kwargs) + self.with_mlm = self.with_mlm or True + + return LanguageModel + + +def extend_with_unified_language_model(BaseModel): + """添加UniLM的Attention Mask(Seq2Seq模型用) + """ + class UnifiedLanguageModel(UniLM_Mask, BaseModel): + """带UniLM的Attention Mask的派生模型 + UniLM: https://arxiv.org/abs/1905.03197 + """ + def __init__(self, *args, **kwargs): + super(UnifiedLanguageModel, self).__init__(*args, **kwargs) + self.with_mlm = self.with_mlm or True + + return UnifiedLanguageModel + + +def data_parallel(model, devices=None, parts=None): + """通过数据并行来实现模型并行 + 参数: + devices:运行设备,默认为所有可用GPU; + parts:batch_size分配,默认为均匀划分; + """ + if devices is None: + devices = get_available_gpus() + elif isinstance(devices, int): + devices = ['/device:GPU:%d' % i for i in range(devices)] + + if parts is None: + parts = len(devices) + else: + assert len(devices) == len(parts) + + splited_inputs = BatchSplit(parts)(model.inputs) + splited_outputs = [[] for _ in model.outputs] + for i, device in enumerate(devices): + with tf.device(device): + outputs = model(splited_inputs[i::len(devices)]) + outputs = outputs if isinstance(outputs, list) else [outputs] + for j, output in enumerate(outputs): + splited_outputs[j].append(output) + + outputs = [BatchConcat()(outputs) for outputs in splited_outputs] + + return Model(model.inputs, outputs) + + +def build_transformer_model( + config_path=None, + checkpoint_path=None, + model='bert', + application='encoder', + return_keras_model=True, + **kwargs +): + """根据配置文件构建模型,可选加载checkpoint权重 + """ + configs = {} + if config_path is not None: + configs.update(json.load(open(config_path))) + configs.update(kwargs) + if 'max_position' not in configs: + configs['max_position'] = configs.get('max_position_embeddings', 512) + if 'dropout_rate' not in configs: + configs['dropout_rate'] = configs.get('hidden_dropout_prob') + 
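+    # As with max_position and dropout_rate above, the remaining defaults fall back
+    # to the field names used in Google-style BERT config files
+    # (attention_probs_dropout_prob, type_vocab_size) when the bert4keras-style
+    # keys are not passed explicitly.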
if 'attention_dropout_rate' not in configs: + configs['attention_dropout_rate'] = configs.get( + 'attention_probs_dropout_prob' + ) + if 'segment_vocab_size' not in configs: + configs['segment_vocab_size'] = configs.get('type_vocab_size', 2) + + models = { + 'bert': BERT, + 'albert': ALBERT, + 'albert_unshared': ALBERT_Unshared, + 'roberta': BERT, + 'nezha': NEZHA, + 'roformer': RoFormer, + 'roformer_v2': RoFormerV2, + 'electra': ELECTRA, + 'gpt': GPT, + 'gpt2': GPT2, + 'gpt2_ml': GPT2_ML, + 't5': T5, + 't5_encoder': T5_Encoder, + 't5_decoder': T5_Decoder, + 't5.1.0': T5, + 't5.1.0_encoder': T5_Encoder, + 't5.1.0_decoder': T5_Decoder, + 't5.1.1': T5, + 't5.1.1_encoder': T5_Encoder, + 't5.1.1_decoder': T5_Decoder, + 'mt5.1.1': T5, + 'mt5.1.1_encoder': T5_Encoder, + 'mt5.1.1_decoder': T5_Decoder, + } + + if is_string(model): + model = model.lower() + MODEL = models[model] + if model.startswith('t5.1.1'): + configs['version'] = 't5.1.1' + elif model.startswith('mt5.1.1'): + configs['version'] = 'mt5.1.1' + else: + MODEL = model + + application = application.lower() + if application in ['lm', 'unilm'] and model in ['electra', 't5']: + raise ValueError( + '"%s" model can not be used as "%s" application.\n' % + (model, application) + ) + + if application == 'lm': + MODEL = extend_with_language_model(MODEL) + elif application == 'unilm': + MODEL = extend_with_unified_language_model(MODEL) + + transformer = MODEL(**configs) + transformer.build(**configs) + + if checkpoint_path is not None: + transformer.load_weights_from_checkpoint(checkpoint_path) + + if return_keras_model: + return transformer.model + else: + return transformer diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/optimizers.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/optimizers.py new file mode 100644 index 0000000000000000000000000000000000000000..6c3dcf227aa5837065c2b0f441c21ccf191b744f --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/optimizers.py @@ -0,0 +1,1170 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# -*- coding: utf-8 -*- +# 优化相关 + +import numpy as np +import tensorflow as tf +from bert4keras.backend import keras, K, is_tf_keras +from bert4keras.snippets import is_string, string_matching +from bert4keras.snippets import is_one_of, insert_arguments +from bert4keras.backend import piecewise_linear +from bert4keras.backend import root_mean_square as rms +import re + + +class Adam(keras.optimizers.Optimizer): + """重新定义Adam优化器,便于派生出新的优化器 + (tensorflow的optimizer_v2类) + """ + def __init__( + self, + learning_rate=0.001, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + bias_correction=True, + **kwargs + ): + kwargs['name'] = kwargs.get('name') or 'Adam' + super(Adam, self).__init__(**kwargs) + self._set_hyper('learning_rate', learning_rate) + self._set_hyper('beta_1', beta_1) + self._set_hyper('beta_2', beta_2) + self.epsilon = epsilon or K.epislon() + self.bias_correction = bias_correction + + def _create_slots(self, var_list): + for var in var_list: + self.add_slot(var, 'm') + self.add_slot(var, 'v') + + def _resource_apply(self, grad, var, indices=None): + # 准备变量 + var_dtype = var.dtype.base_dtype + lr_t = self._decayed_lr(var_dtype) + m = self.get_slot(var, 'm') + v = self.get_slot(var, 'v') + beta_1_t = self._get_hyper('beta_1', var_dtype) + beta_2_t = self._get_hyper('beta_2', var_dtype) + epsilon_t = K.cast(self.epsilon, var_dtype) + local_step = K.cast(self.iterations + 1, var_dtype) + beta_1_t_power = K.pow(beta_1_t, local_step) + beta_2_t_power = K.pow(beta_2_t, local_step) + + # 更新公式 + if indices is None: + m_t = K.update(m, beta_1_t * m + (1 - beta_1_t) * grad) + v_t = K.update(v, beta_2_t * v + (1 - beta_2_t) * K.square(grad)) + else: + mv_ops = [K.update(m, beta_1_t * m), K.update(v, beta_2_t * v)] + with tf.control_dependencies(mv_ops): + m_t = self._resource_scatter_add( + m, indices, (1 - beta_1_t) * grad + ) + v_t = self._resource_scatter_add( + v, indices, (1 - beta_2_t) * K.square(grad) + ) + + # 返回算子 + with tf.control_dependencies([m_t, v_t]): + if self.bias_correction: + m_t = m_t / (1.0 - beta_1_t_power) + v_t = v_t / (1.0 - beta_2_t_power) + var_t = var - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) + return K.update(var, var_t) + + def _resource_apply_dense(self, grad, var): + return self._resource_apply(grad, var) + + def _resource_apply_sparse(self, grad, var, indices): + return self._resource_apply(grad, var, indices) + + def get_config(self): + config = { + 'learning_rate': self._serialize_hyperparameter('learning_rate'), + 'beta_1': self._serialize_hyperparameter('beta_1'), + 'beta_2': self._serialize_hyperparameter('beta_2'), + 'epsilon': self.epsilon, + 'bias_correction': self.bias_correction, + } + base_config = super(Adam, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class AdaFactorBase(keras.optimizers.Optimizer): + """AdaFactor优化器(基类) + 论文链接:https://arxiv.org/abs/1804.04235 + 参考实现:https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/optimize.py + """ + def __init__( + self, + learning_rate=1e-3, # 可以为None + beta1=0.0, + beta2=None, + epsilon1=1e-30, + epsilon2=1e-3, + multiply_by_parameter_scale=True, + clipping_threshold=1.0, + min_dim_size_to_factor=128, + exclude_from_parameter_scale=None, + **kwargs + ): + super(AdaFactorBase, self).__init__(**kwargs) + self._learning_rate = learning_rate + self.beta1 = beta1 + self._beta2 = beta2 + self.epsilon1 = epsilon1 + self.epsilon2 = epsilon2 + self.multiply_by_parameter_scale = multiply_by_parameter_scale + self.clipping_threshold = clipping_threshold + 
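+        # Second-moment factoring is only applied to weights whose second-largest
+        # dimension is at least min_dim_size_to_factor (see factored_shape below);
+        # smaller tensors keep a full per-element accumulator, as in Adam.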
self.min_dim_size_to_factor = min_dim_size_to_factor + self.exclude_from_parameter_scale = exclude_from_parameter_scale or [] + + @property + def learning_rate(self): + if self._learning_rate is None: + iterations = K.cast(self.iterations + 1, K.floatx()) + learning_rate = K.minimum(1.0 / K.sqrt(iterations), 0.01) + if self.multiply_by_parameter_scale: + return learning_rate + else: + return learning_rate * 0.05 + else: + if not hasattr(self, '__learning_rate'): + with K.name_scope(self.__class__.__name__): + self.__learning_rate = K.variable( + self._learning_rate, name='learning_rate' + ) + return self.__learning_rate + + @property + def beta2(self): + if self._beta2 is None: + iterations = K.cast(self.iterations + 1, K.floatx()) + return 1.0 - K.pow(iterations, -0.8) + else: + return self._beta2 + + def factored_shape(self, shape): + if len(shape) < 2: + return None + shape = np.array(shape) + indices = shape.argpartition(-2) + if shape[indices[-2]] < self.min_dim_size_to_factor: + return None + shape1, shape2 = np.array(shape), np.array(shape) + shape1[indices[-1]] = 1 + shape2[indices[-2]] = 1 + return shape1, indices[-1], shape2, indices[-2] + + def _do_parameter_scale(self, w): + return self.multiply_by_parameter_scale and ( + not string_matching(w.name, self.exclude_from_parameter_scale) + ) + + def get_config(self): + config = { + 'learning_rate': self._learning_rate, + 'beta1': self.beta1, + 'beta2': self._beta2, + 'epsilon1': self.epsilon1, + 'epsilon2': self.epsilon2, + 'multiply_by_parameter_scale': self.multiply_by_parameter_scale, + 'clipping_threshold': self.clipping_threshold, + 'min_dim_size_to_factor': self.min_dim_size_to_factor, + 'exclude_from_parameter_scale': self.exclude_from_parameter_scale, + } + base_config = super(AdaFactorBase, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +class AdaFactorV1(AdaFactorBase): + """AdaFactor优化器(纯Keras版) + 论文链接:https://arxiv.org/abs/1804.04235 + 参考实现:https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/optimize.py + """ + def __init__(self, *args, **kwargs): + super(AdaFactorV1, self).__init__(*args, **kwargs) + with K.name_scope(self.__class__.__name__): + self.iterations = K.variable(0, dtype='int64', name='iterations') + + @K.symbolic + def get_updates(self, loss, params): + grads = self.get_gradients(loss, params) + self.updates = [K.update_add(self.iterations, 1)] + self.weights = [self.iterations] + lr = self.learning_rate + + for i, (p, g) in enumerate(zip(params, grads)): + g2 = K.square(g) + self.epsilon1 # 如果换成g**2,在keras下Embedding层会报错 + shape, dtype = K.int_shape(p), K.dtype(p) + factored_shape = self.factored_shape(shape) + if factored_shape is None: + # 定义参数 + v = K.zeros(shape, dtype=dtype, name='v_' + str(i)) + self.weights.append(v) + # 定义更新 + v_t = self.beta2 * v + (1.0 - self.beta2) * g2 + self.updates.append(K.update(v, v_t)) + else: + # 定义参数 + shape1, axis1, shape2, axis2 = factored_shape + vr = K.zeros(shape1, dtype=dtype, name='vr_' + str(i)) + vc = K.zeros(shape2, dtype=dtype, name='vc_' + str(i)) + self.weights.extend([vr, vc]) + # 定义更新 + g2r = K.mean(g2, axis=axis1, keepdims=True) + g2c = K.mean(g2, axis=axis2, keepdims=True) + vr_t = self.beta2 * vr + (1.0 - self.beta2) * g2r + vc_t = self.beta2 * vc + (1.0 - self.beta2) * g2c + self.updates.extend([K.update(vr, vr_t), K.update(vc, vc_t)]) + # 合成矩阵 + v_t = vr_t * vc_t / K.mean(vr_t, axis=axis2, keepdims=True) + # 增量主体 + u = g / K.sqrt(v_t + self.epsilon1) + # 增量裁剪 + if self.clipping_threshold is not None: 
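+                # Update clipping as in the AdaFactor paper: rescale the update so
+                # that its root-mean-square never exceeds clipping_threshold
+                # (1.0 by default).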
+ u = u / K.maximum(1.0, rms(u) / self.clipping_threshold) + # 增量滑动 + if self.beta1 > 0.0: + # 定义参数 + m = K.zeros(shape, dtype=dtype, name='m_' + str(i)) + self.weights.append(m) + # 定义更新 + m_t = self.beta1 * m + (1.0 - self.beta1) * u + self.updates.append(K.update(m, m_t)) + u = m_t + # 增量调整 + if self._do_parameter_scale(p): + u = u * K.maximum(rms(p), self.epsilon2) + # 更新参数 + self.updates.append(K.update(p, p - lr * u)) + + return self.updates + + +class AdaFactorV2(AdaFactorBase): + """AdaFactor优化器(tf.keras版) + 论文链接:https://arxiv.org/abs/1804.04235 + 参考实现:https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/optimize.py + """ + def __init__(self, *args, **kwargs): + kwargs['name'] = kwargs.get('name') or 'AdaFactor' + super(AdaFactorV2, self).__init__(*args, **kwargs) + + def _create_slots(self, var_list): + for var in var_list: + if self.beta1 > 0.0: + self.add_slot(var, 'm') + shape = K.int_shape(var) + factored_shape = self.factored_shape(shape) + if factored_shape is None: + self.add_slot(var, 'v') + else: + shape1, axis1, shape2, axis2 = factored_shape + value1, value2 = np.zeros(shape1), np.zeros(shape2) + self.add_slot(var, 'vr', value1) + self.add_slot(var, 'vc', value2) + + def _decayed_lr(self, var_dtype): + return self.learning_rate + + def _resource_apply(self, grad, var, indices=None): + lr = self._decayed_lr(var.dtype.base_dtype) + g2 = K.square(grad) + self.epsilon1 + shape = K.int_shape(var) + factored_shape = self.factored_shape(shape) + if factored_shape is None: + v = self.get_slot(var, 'v') + # 定义更新 + v_t = self.beta2 * v + (1.0 - self.beta2) * g2 + v_t = K.update(v, v_t) + else: + shape1, axis1, shape2, axis2 = factored_shape + vr = self.get_slot(var, 'vr') + vc = self.get_slot(var, 'vc') + # 定义更新 + g2r = K.mean(g2, axis=axis1, keepdims=True) + g2c = K.mean(g2, axis=axis2, keepdims=True) + vr_t = self.beta2 * vr + (1.0 - self.beta2) * g2r + vc_t = self.beta2 * vc + (1.0 - self.beta2) * g2c + vr_t, vc_t = K.update(vr, vr_t), K.update(vc, vc_t) + # 合成矩阵 + v_t = vr_t * vc_t / K.mean(vr_t, axis=axis2, keepdims=True) + # 增量主体 + u = grad / K.sqrt(v_t + self.epsilon1) + # 增量裁剪 + if self.clipping_threshold is not None: + u = u / K.maximum(1.0, rms(u) / self.clipping_threshold) + # 增量滑动 + if self.beta1 > 0.0: + m = self.get_slot(var, 'm') + # 定义更新 + m_t = self.beta1 * m + (1.0 - self.beta1) * u + u = K.update(m, m_t) + # 增量调整 + if self._do_parameter_scale(var): + u = u * K.maximum(rms(var), self.epsilon2) + # 更新参数 + return K.update(var, var - lr * u) + + def _resource_apply_dense(self, grad, var): + return self._resource_apply(grad, var) + + def _resource_apply_sparse(self, grad, var, indices): + grad = tf.IndexedSlices(grad, indices, K.shape(var)) + grad = tf.convert_to_tensor(grad) + return self._resource_apply_dense(grad, var) + + +def export_to_custom_objects(base_extend_with): + """装饰器,用来将优化器放到custom_objects中 + """ + def new_extend_with(BaseOptimizer, name=None): + NewOptimizer = base_extend_with(BaseOptimizer) + + if is_string(name): + NewOptimizer.__name__ = name + + name = NewOptimizer.__name__ + keras.utils.get_custom_objects()[name] = NewOptimizer + + return NewOptimizer + + return new_extend_with + + +@export_to_custom_objects +def extend_with_weight_decay(BaseOptimizer): + """返回新的优化器类,加入权重衰减 + """ + class NewOptimizer(BaseOptimizer): + """带有权重衰减的优化器 + """ + @insert_arguments(weight_decay_rate=0.01, exclude_from_weight_decay=[]) + def __init__(self, *args, **kwargs): + super(NewOptimizer, self).__init__(*args, **kwargs) + if not hasattr(self, 
'learning_rate'): + self.learning_rate = self.lr + + @K.symbolic + def get_updates(self, loss, params): + old_update = K.update + + def new_update(x, new_x): + if is_one_of(x, params) and self._do_weight_decay(x): + new_x = new_x - self.learning_rate * self.weight_decay_rate * x + return old_update(x, new_x) + + K.update = new_update + updates = super(NewOptimizer, self).get_updates(loss, params) + K.update = old_update + + return updates + + def _do_weight_decay(self, w): + return (not string_matching(w.name, self.exclude_from_weight_decay)) + + def get_config(self): + config = { + 'weight_decay_rate': self.weight_decay_rate, + 'exclude_from_weight_decay': self.exclude_from_weight_decay, + } + base_config = super(NewOptimizer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + return NewOptimizer + + +@export_to_custom_objects +def extend_with_weight_decay_v2(BaseOptimizer): + """返回新的优化器类,加入权重衰减 + """ + class NewOptimizer(BaseOptimizer): + """带有权重衰减的优化器 + """ + @insert_arguments(weight_decay_rate=0.01, exclude_from_weight_decay=[]) + def __init__(self, *args, **kwargs): + super(NewOptimizer, self).__init__(*args, **kwargs) + + def _resource_apply(self, grad, var, indices=None): + old_update = K.update + + def new_update(x, new_x): + if x is var and self._do_weight_decay(x): + lr_t = self._decayed_lr(x.dtype.base_dtype) + new_x = new_x - lr_t * self.weight_decay_rate * x + return old_update(x, new_x) + + K.update = new_update + op = super(NewOptimizer, self)._resource_apply(grad, var, indices) + K.update = old_update + + return op + + def _do_weight_decay(self, w): + return (not string_matching(w.name, self.exclude_from_weight_decay)) + + def get_config(self): + config = { + 'weight_decay_rate': self.weight_decay_rate, + 'exclude_from_weight_decay': self.exclude_from_weight_decay, + } + base_config = super(NewOptimizer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + return NewOptimizer + + +@export_to_custom_objects +def extend_with_layer_adaptation(BaseOptimizer): + """返回新的优化器类,加入层自适应学习率 + """ + class NewOptimizer(BaseOptimizer): + """带有层自适应学习率的优化器 + 用每一层参数的模长来校正当前参数的学习率 + https://arxiv.org/abs/1904.00962 + """ + @insert_arguments(exclude_from_layer_adaptation=[]) + def __init__(self, *args, **kwargs): + super(NewOptimizer, self).__init__(*args, **kwargs) + if not hasattr(self, 'learning_rate'): + self.learning_rate = self.lr + + @K.symbolic + def get_updates(self, loss, params): + old_update = K.update + + def new_update(x, new_x): + if is_one_of(x, params) and self._do_layer_adaptation(x): + dx = new_x - x + lr_t = K.clip(self.learning_rate, K.epsilon(), K.infinity()) + x_norm = tf.norm(x) + g_norm = tf.norm(dx / lr_t) + ratio = K.switch( + x_norm > 0.0, + K.switch(g_norm > 0.0, x_norm / g_norm, 1.0), 1.0 + ) + new_x = x + dx * ratio + return old_update(x, new_x) + + K.update = new_update + updates = super(NewOptimizer, self).get_updates(loss, params) + K.update = old_update + + return updates + + def _do_layer_adaptation(self, w): + return ( + not string_matching(w.name, self.exclude_from_layer_adaptation) + ) + + def get_config(self): + config = { + 'exclude_from_layer_adaptation': + self.exclude_from_layer_adaptation, + } + base_config = super(NewOptimizer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + return NewOptimizer + + +@export_to_custom_objects +def extend_with_layer_adaptation_v2(BaseOptimizer): + """返回新的优化器类,加入层自适应学习率 + """ + class 
NewOptimizer(BaseOptimizer): + """带有层自适应学习率的优化器 + 用每一层参数的模长来校正当前参数的学习率 + https://arxiv.org/abs/1904.00962 + """ + @insert_arguments(exclude_from_layer_adaptation=[]) + def __init__(self, *args, **kwargs): + super(NewOptimizer, self).__init__(*args, **kwargs) + + def _resource_apply(self, grad, var, indices=None): + old_update = K.update + + def new_update(x, new_x): + if x is var and self._do_layer_adaptation(x): + dx = new_x - x + lr_t = self._decayed_lr(x.dtype.base_dtype) + lr_t = K.clip(lr_t, K.epsilon(), K.infinity()) + x_norm = tf.norm(x) + g_norm = tf.norm(dx / lr_t) + ratio = K.switch( + x_norm > 0.0, + K.switch(g_norm > 0.0, x_norm / g_norm, 1.0), 1.0 + ) + new_x = x + dx * ratio + return old_update(x, new_x) + + K.update = new_update + op = super(NewOptimizer, self)._resource_apply(grad, var, indices) + K.update = old_update + + return op + + def _do_layer_adaptation(self, w): + return ( + not string_matching(w.name, self.exclude_from_layer_adaptation) + ) + + def get_config(self): + config = { + 'exclude_from_layer_adaptation': + self.exclude_from_layer_adaptation, + } + base_config = super(NewOptimizer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + return NewOptimizer + + +@export_to_custom_objects +def extend_with_piecewise_linear_lr(BaseOptimizer): + """返回新的优化器类,加入分段线性学习率 + """ + class NewOptimizer(BaseOptimizer): + """带有分段线性学习率的优化器 + 其中schedule是形如{1000: 1, 2000: 0.1}的字典, + 表示0~1000步内学习率线性地从零增加到100%,然后 + 1000~2000步内线性地降到10%,2000步以后保持10% + """ + @insert_arguments(lr_schedule={0: 1}) + def __init__(self, *args, **kwargs): + super(NewOptimizer, self).__init__(*args, **kwargs) + self.lr_schedule = {int(i): j for i, j in self.lr_schedule.items()} + + @K.symbolic + def get_updates(self, loss, params): + lr_multiplier = piecewise_linear(self.iterations, self.lr_schedule) + + old_update = K.update + + def new_update(x, new_x): + if is_one_of(x, params): + new_x = x + (new_x - x) * lr_multiplier + return old_update(x, new_x) + + K.update = new_update + updates = super(NewOptimizer, self).get_updates(loss, params) + K.update = old_update + + return updates + + def get_config(self): + config = { + 'lr_schedule': self.lr_schedule, + } + base_config = super(NewOptimizer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + return NewOptimizer + + +@export_to_custom_objects +def extend_with_piecewise_linear_lr_v2(BaseOptimizer): + """返回新的优化器类,加入分段线性学习率 + """ + class NewOptimizer(BaseOptimizer): + """带有分段线性学习率的优化器 + 其中schedule是形如{1000: 1, 2000: 0.1}的字典, + 表示0~1000步内学习率线性地从零增加到100%,然后 + 1000~2000步内线性地降到10%,2000步以后保持10% + """ + @insert_arguments(lr_schedule={0: 1}) + def __init__(self, *args, **kwargs): + super(NewOptimizer, self).__init__(*args, **kwargs) + self.lr_schedule = {int(i): j for i, j in self.lr_schedule.items()} + + def _decayed_lr(self, var_dtype): + lr_multiplier = piecewise_linear(self.iterations, self.lr_schedule) + lr_t = super(NewOptimizer, self)._decayed_lr(var_dtype) + return lr_t * K.cast(lr_multiplier, var_dtype) + + def get_config(self): + config = { + 'lr_schedule': self.lr_schedule, + } + base_config = super(NewOptimizer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + return NewOptimizer + + +@export_to_custom_objects +def extend_with_gradient_accumulation(BaseOptimizer): + """返回新的优化器类,加入梯度累积 + """ + class NewOptimizer(BaseOptimizer): + """带有梯度累积的优化器 + """ + @insert_arguments(grad_accum_steps=2) + def __init__(self, *args, **kwargs): + 
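+            # Gradients are summed into per-parameter accumulators and the wrapped
+            # optimizer only applies them every grad_accum_steps iterations, so the
+            # effective batch size is grad_accum_steps times the actual one.
+            # Illustrative usage (the alias name is arbitrary):
+            #     AdamGA = extend_with_gradient_accumulation(Adam, 'AdamGA')
+            #     optimizer = AdamGA(learning_rate=1e-5, grad_accum_steps=4)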
super(NewOptimizer, self).__init__(*args, **kwargs) + self.accum_grads = {} + + def get_gradients(self, loss, params): + accum_grads = [] + for p in params: + if p not in self.accum_grads: + self.accum_grads[p] = K.zeros( + K.int_shape(p), dtype=K.dtype(p) + ) + accum_grads.append(self.accum_grads[p]) + + return [ag / self.grad_accum_steps for ag in accum_grads] + + @K.symbolic + def get_updates(self, loss, params): + # 更新判据 + cond = K.equal(self.iterations % self.grad_accum_steps, 0) + cond = K.cast(cond, K.floatx()) + + old_update = K.update + + def new_update(x, new_x): + new_x = cond * new_x + (1 - cond) * x + return old_update(x, new_x) + + K.update = new_update + updates = super(NewOptimizer, self).get_updates(loss, params) + K.update = old_update + + # 获取梯度 + grads = super(NewOptimizer, self).get_gradients(loss, params) + accum_grads = [self.accum_grads[p] for p in params] + # 累积梯度 + with tf.control_dependencies(updates): + accum_updates = [ + K.update(ag, g + (1 - cond) * ag) + for g, ag in zip(grads, accum_grads) + ] + + return accum_updates + + def get_config(self): + config = { + 'grad_accum_steps': self.grad_accum_steps, + } + base_config = super(NewOptimizer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + return NewOptimizer + + +@export_to_custom_objects +def extend_with_gradient_accumulation_v2(BaseOptimizer): + """返回新的优化器类,加入梯度累积 + """ + class NewOptimizer(BaseOptimizer): + """带有梯度累积的优化器 + """ + @insert_arguments(grad_accum_steps=2) + def __init__(self, *args, **kwargs): + super(NewOptimizer, self).__init__(*args, **kwargs) + + def _create_slots(self, var_list): + super(NewOptimizer, self)._create_slots(var_list) + for var in var_list: + self.add_slot(var, 'ag') + + def _resource_apply(self, grad, var, indices=None): + # 更新判据 + cond = K.equal(self.iterations % self.grad_accum_steps, 0) + # 获取梯度 + ag = self.get_slot(var, 'ag') + + old_update = K.update + + def new_update(x, new_x): + new_x = K.switch(cond, new_x, x) + return old_update(x, new_x) + + K.update = new_update + ag_t = ag / self.grad_accum_steps + op = super(NewOptimizer, self)._resource_apply(ag_t, var) + K.update = old_update + + # 累积梯度 + with tf.control_dependencies([op]): + ag_t = K.switch(cond, K.zeros_like(ag), ag) + with tf.control_dependencies([K.update(ag, ag_t)]): + if indices is None: + ag_t = K.update(ag, ag + grad) + else: + ag_t = self._resource_scatter_add(ag, indices, grad) + + return ag_t + + def get_config(self): + config = { + 'grad_accum_steps': self.grad_accum_steps, + } + base_config = super(NewOptimizer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + return NewOptimizer + + +@export_to_custom_objects +def extend_with_lookahead(BaseOptimizer): + """返回新的优化器类,加入look ahead + """ + class NewOptimizer(BaseOptimizer): + """带有look ahead的优化器 + https://arxiv.org/abs/1907.08610 + steps_per_slow_update: 即论文中的k; + slow_step_size: 即论文中的alpha。 + """ + @insert_arguments(steps_per_slow_update=5, slow_step_size=0.5) + def __init__(self, *args, **kwargs): + super(NewOptimizer, self).__init__(*args, **kwargs) + + @K.symbolic + def get_updates(self, loss, params): + updates = super(NewOptimizer, self).get_updates(loss, params) + + k, alpha = self.steps_per_slow_update, self.slow_step_size + cond = K.equal(self.iterations % k, 0) + slow_vars = [ + K.zeros( + K.int_shape(p), dtype=K.dtype(p), name='slow_var_%s' % i + ) for i, p in enumerate(params) + ] + + with tf.control_dependencies(updates): + slow_updates = [ + K.update(q, 
K.switch(cond, q + alpha * (p - q), q)) + for p, q in zip(params, slow_vars) + ] + with tf.control_dependencies(slow_updates): + copy_updates = [ + K.update(p, K.switch(cond, q, p)) + for p, q in zip(params, slow_vars) + ] + + return copy_updates + + def get_config(self): + config = { + 'steps_per_slow_update': self.steps_per_slow_update, + 'slow_step_size': self.slow_step_size, + } + base_config = super(NewOptimizer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + return NewOptimizer + + +@export_to_custom_objects +def extend_with_lookahead_v2(BaseOptimizer): + """返回新的优化器类,加入look ahead + """ + class NewOptimizer(BaseOptimizer): + """带有look ahead的优化器 + https://arxiv.org/abs/1907.08610 + steps_per_slow_update: 即论文中的k; + slow_step_size: 即论文中的alpha。 + """ + @insert_arguments(steps_per_slow_update=5, slow_step_size=0.5) + def __init__(self, *args, **kwargs): + super(NewOptimizer, self).__init__(*args, **kwargs) + + def _create_slots(self, var_list): + super(NewOptimizer, self)._create_slots(var_list) + for var in var_list: + self.add_slot(var, 'slow_var') + + def _resource_apply(self, grad, var, indices=None): + op = super(NewOptimizer, self)._resource_apply(grad, var, indices) + + k, alpha = self.steps_per_slow_update, self.slow_step_size + cond = K.equal(self.iterations % k, 0) + slow_var = self.get_slot(var, 'slow_var') + slow_var_t = slow_var + alpha * (var - slow_var) + + with tf.control_dependencies([op]): + slow_update = K.update( + slow_var, K.switch(cond, slow_var_t, slow_var) + ) + with tf.control_dependencies([slow_update]): + copy_update = K.update(var, K.switch(cond, slow_var, var)) + + return copy_update + + def get_config(self): + config = { + 'steps_per_slow_update': self.steps_per_slow_update, + 'slow_step_size': self.slow_step_size, + } + base_config = super(NewOptimizer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + return NewOptimizer + + +@export_to_custom_objects +def extend_with_lazy_optimization(BaseOptimizer): + """返回新的优化器类,加入懒惰更新 + """ + class NewOptimizer(BaseOptimizer): + """带有懒惰更新的优化器 + 使得部分权重(尤其是embedding)只有在梯度不等于0时 + 才发生更新。 + """ + @insert_arguments(include_in_lazy_optimization=[]) + def __init__(self, *args, **kwargs): + super(NewOptimizer, self).__init__(*args, **kwargs) + self._first_get_gradients = True + + def get_gradients(self, loss, params): + if self._first_get_gradients: + self._first_get_gradients = False + return super(NewOptimizer, self).get_gradients(loss, params) + else: + return [self.grads[p] for p in params] + + @K.symbolic + def get_updates(self, loss, params): + self.grads = dict(zip(params, self.get_gradients(loss, params))) + + old_update = K.update + + def new_update(x, new_x): + if is_one_of(x, params) and self._do_lazy_optimization(x): + g = self.grads[x] + r = K.any(K.not_equal(g, 0.0), axis=-1, keepdims=True) + new_x = x + (new_x - x) * K.cast(r, K.floatx()) + return old_update(x, new_x) + + K.update = new_update + updates = super(NewOptimizer, self).get_updates(loss, params) + K.update = old_update + + return updates + + def _do_lazy_optimization(self, w): + return string_matching(w.name, self.include_in_lazy_optimization) + + def get_config(self): + config = { + 'include_in_lazy_optimization': + self.include_in_lazy_optimization, + } + base_config = super(NewOptimizer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + return NewOptimizer + + +@export_to_custom_objects +def 
extend_with_lazy_optimization_v2(BaseOptimizer): + """返回新的优化器类,加入懒惰更新 + """ + class NewOptimizer(BaseOptimizer): + """带有懒惰更新的优化器 + 使得部分权重(尤其是embedding)只有在梯度不等于0时 + 才发生更新。 + """ + @insert_arguments(include_in_lazy_optimization=[]) + def __init__(self, *args, **kwargs): + super(NewOptimizer, self).__init__(*args, **kwargs) + + def _resource_apply(self, grad, var, indices=None): + old_update = K.update + + def new_update(x, new_x): + if x is var and self._do_lazy_optimization(x): + if indices is None: + r = K.any( + K.not_equal(grad, 0.0), axis=-1, keepdims=True + ) + new_x = x + (new_x - x) * K.cast(r, K.floatx()) + return old_update(x, new_x) + else: + return self._resource_scatter_add( + x, indices, K.gather(new_x - x, indices) + ) + return old_update(x, new_x) + + K.update = new_update + op = super(NewOptimizer, self)._resource_apply(grad, var, indices) + K.update = old_update + + return op + + def _do_lazy_optimization(self, w): + return string_matching(w.name, self.include_in_lazy_optimization) + + def get_config(self): + config = { + 'include_in_lazy_optimization': + self.include_in_lazy_optimization, + } + base_config = super(NewOptimizer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + return NewOptimizer + + +@export_to_custom_objects +def extend_with_exponential_moving_average(BaseOptimizer): + """返回新的优化器类,加入EMA(权重滑动平均) + """ + class NewOptimizer(BaseOptimizer): + """带EMA(权重滑动平均)的优化器 + """ + @insert_arguments(ema_momentum=0.999) + def __init__(self, *args, **kwargs): + super(NewOptimizer, self).__init__(*args, **kwargs) + + def get_updates(self, loss, params): + updates = super(NewOptimizer, self).get_updates(loss, params) + self.model_weights = params + self.ema_weights = [K.zeros(K.shape(w)) for w in params] + self.old_weights = K.batch_get_value(params) + + ema_updates, ema_momentum = [], self.ema_momentum + with tf.control_dependencies(updates): + for w1, w2 in zip(self.ema_weights, params): + new_w = ema_momentum * w1 + (1 - ema_momentum) * w2 + ema_updates.append(K.update(w1, new_w)) + + return ema_updates + + def get_config(self): + config = { + 'ema_momentum': self.ema_momentum, + } + base_config = super(NewOptimizer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def apply_ema_weights(self, bias_correction=True): + """备份原模型权重,然后将平均权重应用到模型上去。 + """ + self.old_weights = K.batch_get_value(self.model_weights) + ema_weights = K.batch_get_value(self.ema_weights) + + if bias_correction: + iterations = K.eval(self.iterations) + scale = 1.0 - np.power(self.ema_momentum, iterations) + ema_weights = [weight / scale for weight in ema_weights] + + K.batch_set_value(zip(self.model_weights, ema_weights)) + + def reset_old_weights(self): + """恢复模型到旧权重。 + """ + K.batch_set_value(zip(self.model_weights, self.old_weights)) + + return NewOptimizer + + +@export_to_custom_objects +def extend_with_exponential_moving_average_v2(BaseOptimizer): + """返回新的优化器类,加入EMA(权重滑动平均) + """ + class NewOptimizer(BaseOptimizer): + """带EMA(权重滑动平均)的优化器 + """ + @insert_arguments(ema_momentum=0.999) + def __init__(self, *args, **kwargs): + super(NewOptimizer, self).__init__(*args, **kwargs) + + def _create_slots(self, var_list): + super(NewOptimizer, self)._create_slots(var_list) + self.model_weights = var_list + self.ema_weights = [] + for var in var_list: + self.ema_weights.append(self.add_slot(var, 'ema')) + + def _resource_apply_dense(self, grad, var): + op = super(NewOptimizer, self)._resource_apply_dense(grad, var) + ema = 
self.get_slot(var, 'ema') + ema_momentum = self.ema_momentum + with tf.control_dependencies([op]): + return K.update( + ema, ema * ema_momentum + var * (1.0 - ema_momentum) + ) + + def _resource_apply_sparse(self, grad, var, indices): + op = super(NewOptimizer, + self)._resource_apply_sparse(grad, var, indices) + ema = self.get_slot(var, 'ema') + ema_momentum = self.ema_momentum + with tf.control_dependencies([op]): + return K.update( + ema, ema * ema_momentum + var * (1.0 - ema_momentum) + ) + + def get_config(self): + config = { + 'ema_momentum': self.ema_momentum, + } + base_config = super(NewOptimizer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def apply_ema_weights(self, bias_correction=True): + """备份原模型权重,然后将平均权重应用到模型上去。 + """ + self.old_weights = K.batch_get_value(self.model_weights) + ema_weights = K.batch_get_value(self.ema_weights) + + if bias_correction: + iterations = K.eval(self.iterations) + scale = 1.0 - np.power(self.ema_momentum, iterations) + ema_weights = [weight / scale for weight in ema_weights] + + K.batch_set_value(zip(self.model_weights, ema_weights)) + + def reset_old_weights(self): + """恢复模型到旧权重。 + """ + K.batch_set_value(zip(self.model_weights, self.old_weights)) + + return NewOptimizer + + +@export_to_custom_objects +def extend_with_parameter_wise_lr(BaseOptimizer): + """返回新的优化器类,加入分参数学习率 + 主要场景就是给每层甚至每个参数设置不同的学习率。 + """ + class NewOptimizer(BaseOptimizer): + """带有分参数学习率的优化器 + 其中schedule是形如{name1: 2, name2: 0.1}的字典, + 其实name1、name2是字符串,表示变量名包含name1的 + 参数学习率乘以2,变量名包含name2的参数学习率要 + 乘以0.1。 + """ + @insert_arguments(paramwise_lr_schedule={}) + def __init__(self, *args, **kwargs): + super(NewOptimizer, self).__init__(*args, **kwargs) + + @K.symbolic + def get_updates(self, loss, params): + old_update = K.update + + def new_update(x, new_x): + if is_one_of(x, params): + lr_multiplier = 1 + for k, v in self.paramwise_lr_schedule.items(): + if k in x.name: + lr_multiplier *= v + if lr_multiplier != 1: + new_x = x + (new_x - x) * lr_multiplier + return old_update(x, new_x) + + K.update = new_update + updates = super(NewOptimizer, self).get_updates(loss, params) + K.update = old_update + + return updates + + def get_config(self): + config = { + 'paramwise_lr_schedule': self.paramwise_lr_schedule, + } + base_config = super(NewOptimizer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + return NewOptimizer + + +@export_to_custom_objects +def extend_with_parameter_wise_lr_v2(BaseOptimizer): + """返回新的优化器类,加入分参数学习率 + 主要场景就是给每层甚至每个参数设置不同的学习率。 + """ + class NewOptimizer(BaseOptimizer): + """带有分参数学习率的优化器 + 其中schedule是形如{name1: 2, name2: 0.1}的字典, + 其实name1、name2是字符串,表示变量名包含name1的 + 参数学习率乘以2,变量名包含name2的参数学习率要 + 乘以0.1。 + """ + @insert_arguments(paramwise_lr_schedule={}) + def __init__(self, *args, **kwargs): + super(NewOptimizer, self).__init__(*args, **kwargs) + + def _resource_apply(self, grad, var, indices=None): + old_update = K.update + + def new_update(x, new_x): + if x is var: + lr_multiplier = 1 + for k, v in self.paramwise_lr_schedule.items(): + if k in x.name: + lr_multiplier *= v + if lr_multiplier != 1: + new_x = x + (new_x - x) * lr_multiplier + return old_update(x, new_x) + + K.update = new_update + op = super(NewOptimizer, self)._resource_apply(grad, var, indices) + K.update = old_update + + return op + + def get_config(self): + config = { + 'paramwise_lr_schedule': self.paramwise_lr_schedule, + } + base_config = super(NewOptimizer, self).get_config() + return 
dict(list(base_config.items()) + list(config.items())) + + return NewOptimizer + + +if is_tf_keras: + extend_with_weight_decay = extend_with_weight_decay_v2 + extend_with_layer_adaptation = extend_with_layer_adaptation_v2 + extend_with_piecewise_linear_lr = extend_with_piecewise_linear_lr_v2 + extend_with_gradient_accumulation = extend_with_gradient_accumulation_v2 + extend_with_lookahead = extend_with_lookahead_v2 + extend_with_lazy_optimization = extend_with_lazy_optimization_v2 + extend_with_exponential_moving_average = extend_with_exponential_moving_average_v2 + extend_with_parameter_wise_lr = extend_with_parameter_wise_lr_v2 + AdaFactor = AdaFactorV2 +else: + Adam = keras.optimizers.Adam + AdaFactor = AdaFactorV1 + +AdaFactor.__name__ = 'AdaFactor' +custom_objects = { + 'Adam': Adam, + 'AdaFactor': AdaFactor, +} + +keras.utils.get_custom_objects().update(custom_objects) diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/snippets.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/snippets.py new file mode 100644 index 0000000000000000000000000000000000000000..79ebfa21c1c9f57008857c2e10168be7db783490 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/snippets.py @@ -0,0 +1,932 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding: utf-8 -*- +# 代码合集 + +import os, sys, six, re, json +import unicodedata +import logging +import numpy as np +from collections import defaultdict +from bert4keras.backend import K, keras, tf + +_open_ = open +is_py2 = six.PY2 + +if not is_py2: + basestring = str + + +def to_array(*args): + """批量转numpy的array + """ + results = [np.array(a) for a in args] + if len(args) == 1: + return results[0] + else: + return results + + +def is_string(s): + """判断是否是字符串 + """ + return isinstance(s, basestring) + + +def strQ2B(ustring): + """全角符号转对应的半角符号 + """ + rstring = '' + for uchar in ustring: + inside_code = ord(uchar) + # 全角空格直接转换 + if inside_code == 12288: + inside_code = 32 + # 全角字符(除空格)根据关系转化 + elif (inside_code >= 65281 and inside_code <= 65374): + inside_code -= 65248 + rstring += unichr(inside_code) + return rstring + + +def string_matching(s, keywords): + """判断s是否至少包含keywords中的至少一个字符串 + """ + for k in keywords: + if re.search(k, s): + return True + return False + + +def convert_to_unicode(text, encoding='utf-8', errors='ignore'): + """字符串转换为unicode格式(假设输入为utf-8格式) + """ + if is_py2: + if isinstance(text, str): + text = text.decode(encoding, errors=errors) + else: + if isinstance(text, bytes): + text = text.decode(encoding, errors=errors) + return text + + +def convert_to_str(text, encoding='utf-8', errors='ignore'): + """字符串转换为str格式(假设输入为utf-8格式) + """ + if is_py2: + if isinstance(text, unicode): + text = text.encode(encoding, errors=errors) + else: + if isinstance(text, bytes): + text = text.decode(encoding, errors=errors) + return text + + +def lowercase_and_normalize(text): + """转小写,并进行简单的标准化 + """ + if is_py2: + text = unicode(text) + text = text.lower() + text = unicodedata.normalize('NFD', text) + text = ''.join([ch for ch in text if unicodedata.category(ch) != 'Mn']) + return text + + +class open: + """模仿python自带的open函数 + 作用:1.主要是为了同时兼容py2和py3;2.增加了索引功能,方便读取大文件。 + """ + def __init__( + self, name, mode='r', encoding=None, errors='strict', indexable=False + ): + self.name = name + if is_py2: + self.file = _open_(name, mode) + else: + self.file = _open_(name, mode, encoding=encoding, errors=errors) + self.encoding = encoding + self.errors = errors + self.iterator = None + if indexable: + if is_string(indexable) and os.path.exists(indexable): + self.offsets = json.load(_open_(indexable)) + else: + self.create_indexes() + if is_string(indexable): + json.dump(self.offsets, _open_(indexable, 'w')) + + def create_indexes(self): + print('creating indexes ...') + self.offsets, offset = [], 0 + pbar = keras.utils.Progbar(os.path.getsize(self.name)) + while self.readline(): + self.offsets.append(offset) + offset = self.tell() + pbar.update(offset) + self.seek(0) + print('indexes created.') + + def __getitem__(self, key): + self.seek(self.offsets[key]) + l = self.readline() + if self.encoding: + l = convert_to_unicode(l, self.encoding, self.errors) + return l + + def __len__(self): + return len(self.offsets) + + def __iter__(self): + for l in self.file: + if self.encoding: + l = convert_to_unicode(l, self.encoding, self.errors) + yield l + + def next(self): + if self.iterator is None: + self.iterator = self.__iter__() + return next(self.iterator) + + def __next__(self): + return self.next() + + def read(self): + text = self.file.read() + if self.encoding: + text = convert_to_unicode(text, self.encoding, self.errors) + return text + + def readline(self): + text = self.file.readline() + if self.encoding: + text = convert_to_unicode(text, self.encoding, self.errors) + return text + + def 
readlines(self): + if self.encoding: + return [ + convert_to_unicode(text, self.encoding, self.errors) + for text in self.file.readlines() + ] + else: + return self.file.readlines() + + def write(self, text): + if self.encoding: + text = convert_to_str(text, self.encoding, self.errors) + self.file.write(text) + + def flush(self): + self.file.flush() + + def close(self): + self.file.close() + + def tell(self): + return self.file.tell() + + def seek(self, offset=0): + return self.file.seek(offset) + + def __enter__(self): + return self + + def __exit__(self, type, value, tb): + self.close() + + +def parallel_apply_generator( + func, iterable, workers, max_queue_size, dummy=False, random_seeds=True +): + """多进程或多线程地将func应用到iterable的每个元素中。 + 注意这个apply是异步且无序的,也就是说依次输入a,b,c,但是 + 输出可能是func(c), func(a), func(b)。结果将作为一个 + generator返回,其中每个item是输入的序号以及该输入对应的 + 处理结果。 + 参数: + dummy: False是多进程/线性,True则是多线程/线性; + random_seeds: 每个进程的随机种子。 + """ + if dummy: + from multiprocessing.dummy import Pool, Queue + else: + from multiprocessing import Pool, Queue + + in_queue, out_queue, seed_queue = Queue(max_queue_size), Queue(), Queue() + if random_seeds is True: + random_seeds = [None] * workers + elif random_seeds is None or random_seeds is False: + random_seeds = [] + for seed in random_seeds: + seed_queue.put(seed) + + def worker_step(in_queue, out_queue): + """单步函数包装成循环执行 + """ + if not seed_queue.empty(): + np.random.seed(seed_queue.get()) + while True: + i, d = in_queue.get() + r = func(d) + out_queue.put((i, r)) + + # 启动多进程/线程 + pool = Pool(workers, worker_step, (in_queue, out_queue)) + + # 存入数据,取出结果 + in_count, out_count = 0, 0 + for i, d in enumerate(iterable): + in_count += 1 + while True: + try: + in_queue.put((i, d), block=False) + break + except six.moves.queue.Full: + while out_queue.qsize() > max_queue_size: + yield out_queue.get() + out_count += 1 + if out_queue.qsize() > 0: + yield out_queue.get() + out_count += 1 + + while out_count != in_count: + yield out_queue.get() + out_count += 1 + + pool.terminate() + + +def parallel_apply( + func, + iterable, + workers, + max_queue_size, + callback=None, + dummy=False, + random_seeds=True, + unordered=True +): + """多进程或多线程地将func应用到iterable的每个元素中。 + 注意这个apply是异步且无序的,也就是说依次输入a,b,c,但是 + 输出可能是func(c), func(a), func(b)。 + 参数: + callback: 处理单个输出的回调函数; + dummy: False是多进程/线性,True则是多线程/线性; + random_seeds: 每个进程的随机种子; + unordered: 若为False,则按照输入顺序返回,仅当callback为None时生效。 + """ + generator = parallel_apply_generator( + func, iterable, workers, max_queue_size, dummy, random_seeds + ) + + if callback is None: + if unordered: + return [d for i, d in generator] + else: + results = sorted(generator, key=lambda d: d[0]) + return [d for i, d in results] + else: + for i, d in generator: + callback(d) + + +def sequence_padding(inputs, length=None, value=0, seq_dims=1, mode='post'): + """Numpy函数,将序列padding到同一长度 + """ + if length is None: + length = np.max([np.shape(x)[:seq_dims] for x in inputs], axis=0) + elif not hasattr(length, '__getitem__'): + length = [length] + + slices = [np.s_[:length[i]] for i in range(seq_dims)] + slices = tuple(slices) if len(slices) > 1 else slices[0] + pad_width = [(0, 0) for _ in np.shape(inputs[0])] + + outputs = [] + for x in inputs: + x = x[slices] + for i in range(seq_dims): + if mode == 'post': + pad_width[i] = (0, length[i] - np.shape(x)[i]) + elif mode == 'pre': + pad_width[i] = (length[i] - np.shape(x)[i], 0) + else: + raise ValueError('"mode" argument must be "post" or "pre".') + x = np.pad(x, pad_width, 'constant', 
constant_values=value) + outputs.append(x) + + return np.array(outputs) + + +def truncate_sequences(maxlen, indices, *sequences): + """截断总长度至不超过maxlen + """ + sequences = [s for s in sequences if s] + if not isinstance(indices, (list, tuple)): + indices = [indices] * len(sequences) + + while True: + lengths = [len(s) for s in sequences] + if sum(lengths) > maxlen: + i = np.argmax(lengths) + sequences[i].pop(indices[i]) + else: + return sequences + + +def text_segmentate(text, maxlen, seps='\n', strips=None): + """将文本按照标点符号划分为若干个短句 + """ + text = text.strip().strip(strips) + if seps and len(text) > maxlen: + pieces = text.split(seps[0]) + text, texts = '', [] + for i, p in enumerate(pieces): + if text and p and len(text) + len(p) > maxlen - 1: + texts.extend(text_segmentate(text, maxlen, seps[1:], strips)) + text = '' + if i + 1 == len(pieces): + text = text + p + else: + text = text + p + seps[0] + if text: + texts.extend(text_segmentate(text, maxlen, seps[1:], strips)) + return texts + else: + return [text] + + +def is_one_of(x, ys): + """判断x是否在ys之中 + 等价于x in ys,但有些情况下x in ys会报错 + """ + for y in ys: + if x is y: + return True + return False + + +class DataGenerator(object): + """数据生成器模版 + """ + def __init__(self, data, batch_size=32, buffer_size=None): + self.data = data + self.batch_size = batch_size + if hasattr(self.data, '__len__'): + self.steps = len(self.data) // self.batch_size + if len(self.data) % self.batch_size != 0: + self.steps += 1 + else: + self.steps = None + self.buffer_size = buffer_size or batch_size * 1000 + + def __len__(self): + return self.steps + + def sample(self, random=False): + """采样函数,每个样本同时返回一个is_end标记 + """ + if random: + if self.steps is None: + + def generator(): + caches, isfull = [], False + for d in self.data: + caches.append(d) + if isfull: + i = np.random.randint(len(caches)) + yield caches.pop(i) + elif len(caches) == self.buffer_size: + isfull = True + while caches: + i = np.random.randint(len(caches)) + yield caches.pop(i) + + else: + + def generator(): + for i in np.random.permutation(len(self.data)): + yield self.data[i] + + data = generator() + else: + data = iter(self.data) + + d_current = next(data) + for d_next in data: + yield False, d_current + d_current = d_next + + yield True, d_current + + def __iter__(self, random=False): + raise NotImplementedError + + def forfit(self, random=True): + while True: + for d in self.__iter__(random): + yield d + + def fortest(self, random=False): + while True: + for d in self.__iter__(random): + yield d[0] + + def to_dataset(self, types, shapes, names=None, padded_batch=False): + """转为tf.data.Dataset格式 + 如果传入names的话,自动把数据包装成dict形式。 + """ + if names is None: + + generator = self.forfit + + else: + + if is_string(names): + warps = lambda k, v: {k: v} + elif is_string(names[0]): + warps = lambda k, v: dict(zip(k, v)) + else: + warps = lambda k, v: tuple( + dict(zip(i, j)) for i, j in zip(k, v) + ) + + def generator(): + for d in self.forfit(): + yield warps(names, d) + + types = warps(names, types) + shapes = warps(names, shapes) + + if padded_batch: + dataset = tf.data.Dataset.from_generator( + generator, output_types=types + ) + dataset = dataset.padded_batch(self.batch_size, shapes) + else: + dataset = tf.data.Dataset.from_generator( + generator, output_types=types, output_shapes=shapes + ) + dataset = dataset.batch(self.batch_size) + + return dataset + + +class ViterbiDecoder(object): + """Viterbi解码算法基类 + """ + def __init__(self, trans, starts=None, ends=None): + self.trans = trans + self.num_labels = 
len(trans) + self.non_starts = [] + self.non_ends = [] + if starts is not None: + for i in range(self.num_labels): + if i not in starts: + self.non_starts.append(i) + if ends is not None: + for i in range(self.num_labels): + if i not in ends: + self.non_ends.append(i) + + def decode(self, nodes): + """nodes.shape=[seq_len, num_labels] + """ + # 预处理 + nodes[0, self.non_starts] -= np.inf + nodes[-1, self.non_ends] -= np.inf + + # 动态规划 + labels = np.arange(self.num_labels).reshape((1, -1)) + scores = nodes[0].reshape((-1, 1)) + paths = labels + for l in range(1, len(nodes)): + M = scores + self.trans + nodes[l].reshape((1, -1)) + idxs = M.argmax(0) + scores = M.max(0).reshape((-1, 1)) + paths = np.concatenate([paths[:, idxs], labels], 0) + + # 最优路径 + return paths[:, scores[:, 0].argmax()] + + +def softmax(x, axis=-1): + """numpy版softmax + """ + x = x - x.max(axis=axis, keepdims=True) + x = np.exp(x) + return x / x.sum(axis=axis, keepdims=True) + + +class AutoRegressiveDecoder(object): + """通用自回归生成模型解码基类 + 包含beam search和random sample两种策略 + """ + def __init__(self, start_id, end_id, maxlen, minlen=1): + self.start_id = start_id + self.end_id = end_id + self.maxlen = maxlen + self.minlen = minlen + self.models = {} + if start_id is None: + self.first_output_ids = np.empty((1, 0), dtype=int) + else: + self.first_output_ids = np.array([[self.start_id]]) + + @staticmethod + def wraps(default_rtype='probas', use_states=False): + """用来进一步完善predict函数 + 目前包含:1. 设置rtype参数,并做相应处理; + 2. 确定states的使用,并做相应处理; + 3. 设置温度参数,并做相应处理。 + """ + def actual_decorator(predict): + def new_predict( + self, + inputs, + output_ids, + states, + temperature=1, + rtype=default_rtype + ): + assert rtype in ['probas', 'logits'] + prediction = predict(self, inputs, output_ids, states) + + if not use_states: + prediction = (prediction, None) + + if default_rtype == 'logits': + prediction = ( + softmax(prediction[0] / temperature), prediction[1] + ) + elif temperature != 1: + probas = np.power(prediction[0], 1.0 / temperature) + probas = probas / probas.sum(axis=-1, keepdims=True) + prediction = (probas, prediction[1]) + + if rtype == 'probas': + return prediction + else: + return np.log(prediction[0] + 1e-12), prediction[1] + + return new_predict + + return actual_decorator + + def last_token(self, model): + """创建一个只返回最后一个token输出的新Model + """ + if model not in self.models: + outputs = [ + keras.layers.Lambda(lambda x: x[:, -1])(output) + for output in model.outputs + ] + self.models[model] = keras.models.Model(model.inputs, outputs) + + return self.models[model] + + def predict(self, inputs, output_ids, states=None): + """用户需自定义递归预测函数 + 说明:定义的时候,需要用wraps方法进行装饰,传入default_rtype和use_states, + 其中default_rtype为字符串logits或probas,probas时返回归一化的概率, + rtype=logits时则返回softmax前的结果或者概率对数。 + 返回:二元组 (得分或概率, states) + """ + raise NotImplementedError + + def beam_search(self, inputs, topk, states=None, temperature=1, min_ends=1): + """beam search解码 + 说明:这里的topk即beam size; + 返回:最优解码序列。 + """ + inputs = [np.array([i]) for i in inputs] + output_ids, output_scores = self.first_output_ids, np.zeros(1) + for step in range(self.maxlen): + scores, states = self.predict( + inputs, output_ids, states, temperature, 'logits' + ) # 计算当前得分 + if step == 0: # 第1步预测后将输入重复topk次 + inputs = [np.repeat(i, topk, axis=0) for i in inputs] + scores = output_scores.reshape((-1, 1)) + scores # 综合累积得分 + indices = scores.argpartition(-topk, axis=None)[-topk:] # 仅保留topk + indices_1 = indices // scores.shape[1] # 行索引 + indices_2 = (indices % scores.shape[1]).reshape((-1, 1)) # 
列索引 + output_ids = np.concatenate([output_ids[indices_1], indices_2], + 1) # 更新输出 + output_scores = np.take_along_axis( + scores, indices, axis=None + ) # 更新得分 + is_end = output_ids[:, -1] == self.end_id # 标记是否以end标记结束 + end_counts = (output_ids == self.end_id).sum(1) # 统计出现的end标记 + if output_ids.shape[1] >= self.minlen: # 最短长度判断 + best = output_scores.argmax() # 得分最大的那个 + if is_end[best] and end_counts[best] >= min_ends: # 如果已经终止 + return output_ids[best] # 直接输出 + else: # 否则,只保留未完成部分 + flag = ~is_end | (end_counts < min_ends) # 标记未完成序列 + if not flag.all(): # 如果有已完成的 + inputs = [i[flag] for i in inputs] # 扔掉已完成序列 + output_ids = output_ids[flag] # 扔掉已完成序列 + output_scores = output_scores[flag] # 扔掉已完成序列 + end_counts = end_counts[flag] # 扔掉已完成end计数 + topk = flag.sum() # topk相应变化 + # 达到长度直接输出 + return output_ids[output_scores.argmax()] + + def random_sample( + self, + inputs, + n, + topk=None, + topp=None, + states=None, + temperature=1, + min_ends=1 + ): + """随机采样n个结果 + 说明:非None的topk表示每一步只从概率最高的topk个中采样;而非None的topp + 表示每一步只从概率最高的且概率之和刚好达到topp的若干个token中采样。 + 返回:n个解码序列组成的list。 + """ + inputs = [np.array([i]) for i in inputs] + output_ids = self.first_output_ids + results = [] + for step in range(self.maxlen): + probas, states = self.predict( + inputs, output_ids, states, temperature, 'probas' + ) # 计算当前概率 + probas /= probas.sum(axis=1, keepdims=True) # 确保归一化 + if step == 0: # 第1步预测后将结果重复n次 + probas = np.repeat(probas, n, axis=0) + inputs = [np.repeat(i, n, axis=0) for i in inputs] + output_ids = np.repeat(output_ids, n, axis=0) + if topk is not None: + k_indices = probas.argpartition(-topk, + axis=1)[:, -topk:] # 仅保留topk + probas = np.take_along_axis(probas, k_indices, axis=1) # topk概率 + probas /= probas.sum(axis=1, keepdims=True) # 重新归一化 + if topp is not None: + p_indices = probas.argsort(axis=1)[:, ::-1] # 从高到低排序 + probas = np.take_along_axis(probas, p_indices, axis=1) # 排序概率 + cumsum_probas = np.cumsum(probas, axis=1) # 累积概率 + flag = np.roll(cumsum_probas >= topp, 1, axis=1) # 标记超过topp的部分 + flag[:, 0] = False # 结合上面的np.roll,实现平移一位的效果 + probas[flag] = 0 # 后面的全部置零 + probas /= probas.sum(axis=1, keepdims=True) # 重新归一化 + sample_func = lambda p: np.random.choice(len(p), p=p) # 按概率采样函数 + sample_ids = np.apply_along_axis(sample_func, 1, probas) # 执行采样 + sample_ids = sample_ids.reshape((-1, 1)) # 对齐形状 + if topp is not None: + sample_ids = np.take_along_axis( + p_indices, sample_ids, axis=1 + ) # 对齐原id + if topk is not None: + sample_ids = np.take_along_axis( + k_indices, sample_ids, axis=1 + ) # 对齐原id + output_ids = np.concatenate([output_ids, sample_ids], 1) # 更新输出 + is_end = output_ids[:, -1] == self.end_id # 标记是否以end标记结束 + end_counts = (output_ids == self.end_id).sum(1) # 统计出现的end标记 + if output_ids.shape[1] >= self.minlen: # 最短长度判断 + flag = is_end & (end_counts >= min_ends) # 标记已完成序列 + if flag.any(): # 如果有已完成的 + for ids in output_ids[flag]: # 存好已完成序列 + results.append(ids) + flag = (flag == False) # 标记未完成序列 + inputs = [i[flag] for i in inputs] # 只保留未完成部分输入 + output_ids = output_ids[flag] # 只保留未完成部分候选集 + end_counts = end_counts[flag] # 只保留未完成部分end计数 + if len(output_ids) == 0: + break + # 如果还有未完成序列,直接放入结果 + for ids in output_ids: + results.append(ids) + # 返回结果 + return results + + +def insert_arguments(**arguments): + """装饰器,为类方法增加参数 + (主要用于类的__init__方法) + """ + def actual_decorator(func): + def new_func(self, *args, **kwargs): + for k, v in arguments.items(): + if k in kwargs: + v = kwargs.pop(k) + setattr(self, k, v) + return func(self, *args, **kwargs) + + return new_func + + return 
actual_decorator + + +def delete_arguments(*arguments): + """装饰器,为类方法删除参数 + (主要用于类的__init__方法) + """ + def actual_decorator(func): + def new_func(self, *args, **kwargs): + for k in arguments: + if k in kwargs: + raise TypeError( + '%s got an unexpected keyword argument \'%s\'' % + (self.__class__.__name__, k) + ) + return func(self, *args, **kwargs) + + return new_func + + return actual_decorator + + +def longest_common_substring(source, target): + """最长公共子串(source和target的最长公共切片区间) + 返回:子串长度, 所在区间(四元组) + 注意:最长公共子串可能不止一个,所返回的区间只代表其中一个。 + """ + c, l, span = defaultdict(int), 0, (0, 0, 0, 0) + for i, si in enumerate(source, 1): + for j, tj in enumerate(target, 1): + if si == tj: + c[i, j] = c[i - 1, j - 1] + 1 + if c[i, j] > l: + l = c[i, j] + span = (i - l, i, j - l, j) + return l, span + + +def longest_common_subsequence(source, target): + """最长公共子序列(source和target的最长非连续子序列) + 返回:子序列长度, 映射关系(映射对组成的list) + 注意:最长公共子序列可能不止一个,所返回的映射只代表其中一个。 + """ + c = defaultdict(int) + for i, si in enumerate(source, 1): + for j, tj in enumerate(target, 1): + if si == tj: + c[i, j] = c[i - 1, j - 1] + 1 + elif c[i, j - 1] > c[i - 1, j]: + c[i, j] = c[i, j - 1] + else: + c[i, j] = c[i - 1, j] + l, mapping = c[len(source), len(target)], [] + i, j = len(source) - 1, len(target) - 1 + while len(mapping) < l: + if source[i] == target[j]: + mapping.append((i, j)) + i, j = i - 1, j - 1 + elif c[i + 1, j] > c[i, j + 1]: + j = j - 1 + else: + i = i - 1 + return l, mapping[::-1] + + +def orthogonally_resize(a, new_shape, window=2): + """简单的正交化缩放矩阵 + """ + assert a.ndim == len(new_shape) + slices, a_norm, w = [], np.linalg.norm(a), window + for i, (d1, d2) in enumerate(zip(a.shape, new_shape)): + if d1 != d2: + k = d2 // d1 + int(d2 % d1 != 0) + if k > 1: + assert d1 % w == 0 + a = a.reshape(a.shape[:i] + (d1 // w, w) + a.shape[i + 1:]) + a = np.repeat(a, k, axis=i) + a = a.reshape(a.shape[:i] + (d1 * k,) + a.shape[i + 2:]) + slices.append(np.s_[:d2]) + a = a[tuple(slices)] + return a / np.linalg.norm(a) * a_norm + + +class WebServing(object): + """简单的Web接口 + 用法: + arguments = {'text': (None, True), 'n': (int, False)} + web = WebServing(port=8864) + web.route('/gen_synonyms', gen_synonyms, arguments) + web.start() + # 然后访问 http://127.0.0.1:8864/gen_synonyms?text=你好 + 说明: + 基于bottlepy简单封装,仅作为临时测试使用,不保证性能。 + 目前仅保证支持 Tensorflow 1.x + Keras <= 2.3.1。 + 欢迎有经验的开发者帮忙改进。 + 依赖: + pip install bottle + pip install paste + (如果不用 server='paste' 的话,可以不装paste库) + """ + def __init__(self, host='0.0.0.0', port=8000, server='paste'): + + import bottle + + self.host = host + self.port = port + self.server = server + self.graph = tf.get_default_graph() + self.sess = K.get_session() + self.set_session = K.set_session + self.bottle = bottle + + def wraps(self, func, arguments, method='GET'): + """封装为接口函数 + 参数: + func:要转换为接口的函数,需要保证输出可以json化,即需要 + 保证 json.dumps(func(inputs)) 能被执行成功; + arguments:声明func所需参数,其中key为参数名,value[0]为 + 对应的转换函数(接口获取到的参数值都是字符串 + 型),value[1]为该参数是否必须; + method:GET或者POST。 + """ + def new_func(): + outputs = {'code': 0, 'desc': u'succeeded', 'data': {}} + kwargs = {} + for key, value in arguments.items(): + if method == 'GET': + result = self.bottle.request.GET.getunicode(key) + else: + result = self.bottle.request.POST.getunicode(key) + if result is None: + if value[1]: + outputs['code'] = 1 + outputs['desc'] = 'lack of "%s" argument' % key + return json.dumps(outputs, ensure_ascii=False) + else: + if value[0] is not None: + result = value[0](result) + kwargs[key] = result + try: + with self.graph.as_default(): + 
self.set_session(self.sess) + outputs['data'] = func(**kwargs) + except Exception as e: + outputs['code'] = 2 + outputs['desc'] = str(e) + return json.dumps(outputs, ensure_ascii=False) + + return new_func + + def route(self, path, func, arguments, method='GET'): + """添加接口 + """ + func = self.wraps(func, arguments, method) + self.bottle.route(path, method=method)(func) + + def start(self): + """启动服务 + """ + self.bottle.run(host=self.host, port=self.port, server=self.server) + + +class Hook: + """注入uniout模块,实现import时才触发 + """ + def __init__(self, module): + self.module = module + + def __getattr__(self, attr): + """使得 from bert4keras.snippets import uniout + 等效于 import uniout (自动识别Python版本,Python3 + 下则无操作。) + """ + if attr == 'uniout': + if is_py2: + import uniout + else: + return getattr(self.module, attr) + + +Hook.__name__ = __name__ +sys.modules[__name__] = Hook(sys.modules[__name__]) +del Hook diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/tokenizers.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/tokenizers.py new file mode 100644 index 0000000000000000000000000000000000000000..04b41923ca7ae0a4c7838ce4224d296e09d6c55e --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/bertkeras/tokenizers.py @@ -0,0 +1,491 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding: utf-8 -*- +# 分词函数 + +import unicodedata, re +from bert4keras.snippets import is_string, is_py2 +from bert4keras.snippets import open +from bert4keras.snippets import convert_to_unicode +from bert4keras.snippets import truncate_sequences +from bert4keras.snippets import lowercase_and_normalize + + +def load_vocab(dict_path, encoding='utf-8', simplified=False, startswith=None): + """从bert的词典文件中读取词典 + """ + token_dict = {} + with open(dict_path, encoding=encoding) as reader: + for line in reader: + token = line.split() + token = token[0] if token else line.strip() + token_dict[token] = len(token_dict) + + if simplified: # 过滤冗余部分token + new_token_dict, keep_tokens = {}, [] + startswith = startswith or [] + for t in startswith: + new_token_dict[t] = len(new_token_dict) + keep_tokens.append(token_dict[t]) + + for t, _ in sorted(token_dict.items(), key=lambda s: s[1]): + if t not in new_token_dict and not Tokenizer._is_redundant(t): + new_token_dict[t] = len(new_token_dict) + keep_tokens.append(token_dict[t]) + + return new_token_dict, keep_tokens + else: + return token_dict + + +def save_vocab(dict_path, token_dict, encoding='utf-8'): + """将词典(比如精简过的)保存为文件 + """ + with open(dict_path, 'w', encoding=encoding) as writer: + for k, v in sorted(token_dict.items(), key=lambda s: s[1]): + writer.write(k + '\n') + + +class TokenizerBase(object): + """分词器基类 + """ + def __init__( + self, + token_start='[CLS]', + token_end='[SEP]', + pre_tokenize=None, + token_translate=None + ): + """参数说明: + pre_tokenize:外部传入的分词函数,用作对文本进行预分词。如果传入 + pre_tokenize,则先执行pre_tokenize(text),然后在它 + 的基础上执行原本的tokenize函数; + token_translate:映射字典,主要用在tokenize之后,将某些特殊的token + 替换为对应的token。 + """ + self._token_pad = '[PAD]' + self._token_unk = '[UNK]' + self._token_mask = '[MASK]' + self._token_start = token_start + self._token_end = token_end + self._pre_tokenize = pre_tokenize + self._token_translate = token_translate or {} + self._token_translate_inv = { + v: k + for k, v in self._token_translate.items() + } + + def tokenize(self, text, maxlen=None): + """分词函数 + """ + tokens = [ + self._token_translate.get(token) or token + for token in self._tokenize(text) + ] + if self._token_start is not None: + tokens.insert(0, self._token_start) + if self._token_end is not None: + tokens.append(self._token_end) + + if maxlen is not None: + index = int(self._token_end is not None) + 1 + truncate_sequences(maxlen, -index, tokens) + + return tokens + + def token_to_id(self, token): + """token转换为对应的id + """ + raise NotImplementedError + + def tokens_to_ids(self, tokens): + """token序列转换为对应的id序列 + """ + return [self.token_to_id(token) for token in tokens] + + def encode( + self, + first_text, + second_text=None, + maxlen=None, + pattern='S*E*E', + truncate_from='right' + ): + """输出文本对应token id和segment id + """ + if is_string(first_text): + first_tokens = self.tokenize(first_text) + else: + first_tokens = first_text + + if second_text is None: + second_tokens = None + elif is_string(second_text): + second_tokens = self.tokenize(second_text) + else: + second_tokens = second_text + + if maxlen is not None: + if truncate_from == 'right': + index = -int(self._token_end is not None) - 1 + elif truncate_from == 'left': + index = int(self._token_start is not None) + else: + index = truncate_from + if second_text is not None and pattern == 'S*E*E': + maxlen += 1 + truncate_sequences(maxlen, index, first_tokens, second_tokens) + + first_token_ids = self.tokens_to_ids(first_tokens) + first_segment_ids = [0] * len(first_token_ids) + + if second_text is 
not None: + if pattern == 'S*E*E': + idx = int(bool(self._token_start)) + second_tokens = second_tokens[idx:] + second_token_ids = self.tokens_to_ids(second_tokens) + second_segment_ids = [1] * len(second_token_ids) + first_token_ids.extend(second_token_ids) + first_segment_ids.extend(second_segment_ids) + + return first_token_ids, first_segment_ids + + def id_to_token(self, i): + """id序列为对应的token + """ + raise NotImplementedError + + def ids_to_tokens(self, ids): + """id序列转换为对应的token序列 + """ + return [self.id_to_token(i) for i in ids] + + def decode(self, ids): + """转为可读文本 + """ + raise NotImplementedError + + def _tokenize(self, text): + """基本分词函数 + """ + raise NotImplementedError + + +class Tokenizer(TokenizerBase): + """Bert原生分词器 + 纯Python实现,代码修改自keras_bert的tokenizer实现 + """ + def __init__( + self, token_dict, do_lower_case=False, word_maxlen=200, **kwargs + ): + super(Tokenizer, self).__init__(**kwargs) + if is_string(token_dict): + token_dict = load_vocab(token_dict) + + self._do_lower_case = do_lower_case + self._token_dict = token_dict + self._token_dict_inv = {v: k for k, v in token_dict.items()} + self._vocab_size = len(token_dict) + self._word_maxlen = word_maxlen + + for token in ['pad', 'unk', 'mask', 'start', 'end']: + try: + _token_id = token_dict[getattr(self, '_token_%s' % token)] + setattr(self, '_token_%s_id' % token, _token_id) + except: + pass + + def token_to_id(self, token): + """token转换为对应的id + """ + return self._token_dict.get(token, self._token_unk_id) + + def id_to_token(self, i): + """id转换为对应的token + """ + return self._token_dict_inv[i] + + def decode(self, ids, tokens=None): + """转为可读文本 + """ + tokens = tokens or self.ids_to_tokens(ids) + tokens = [token for token in tokens if not self._is_special(token)] + + text, flag = '', False + for i, token in enumerate(tokens): + if token[:2] == '##': + text += token[2:] + elif len(token) == 1 and self._is_cjk_character(token): + text += token + elif len(token) == 1 and self._is_punctuation(token): + text += token + text += ' ' + elif i > 0 and self._is_cjk_character(text[-1]): + text += token + else: + text += ' ' + text += token + + text = re.sub(' +', ' ', text) + text = re.sub('\' (re|m|s|t|ve|d|ll) ', '\'\\1 ', text) + punctuation = self._cjk_punctuation() + '+-/={(<[' + punctuation_regex = '|'.join([re.escape(p) for p in punctuation]) + punctuation_regex = '(%s) ' % punctuation_regex + text = re.sub(punctuation_regex, '\\1', text) + text = re.sub('(\d\.) 
(\d)', '\\1\\2', text) + + return text.strip() + + def _tokenize(self, text, pre_tokenize=True): + """基本分词函数 + """ + if self._do_lower_case: + text = lowercase_and_normalize(text) + + if pre_tokenize and self._pre_tokenize is not None: + tokens = [] + for token in self._pre_tokenize(text): + if token in self._token_dict: + tokens.append(token) + else: + tokens.extend(self._tokenize(token, False)) + return tokens + + spaced = '' + for ch in text: + if self._is_punctuation(ch) or self._is_cjk_character(ch): + spaced += ' ' + ch + ' ' + elif self._is_space(ch): + spaced += ' ' + elif ord(ch) == 0 or ord(ch) == 0xfffd or self._is_control(ch): + continue + else: + spaced += ch + + tokens = [] + for word in spaced.strip().split(): + tokens.extend(self._word_piece_tokenize(word)) + + return tokens + + def _word_piece_tokenize(self, word): + """word内分成subword + """ + if len(word) > self._word_maxlen: + return [word] + + tokens, start, end = [], 0, 0 + while start < len(word): + end = len(word) + while end > start: + sub = word[start:end] + if start > 0: + sub = '##' + sub + if sub in self._token_dict: + break + end -= 1 + if start == end: + return [word] + else: + tokens.append(sub) + start = end + + return tokens + + @staticmethod + def stem(token): + """获取token的“词干”(如果是##开头,则自动去掉##) + """ + if token[:2] == '##': + return token[2:] + else: + return token + + @staticmethod + def _is_space(ch): + """空格类字符判断 + """ + return ch == ' ' or ch == '\n' or ch == '\r' or ch == '\t' or \ + unicodedata.category(ch) == 'Zs' + + @staticmethod + def _is_punctuation(ch): + """标点符号类字符判断(全/半角均在此内) + 提醒:unicodedata.category这个函数在py2和py3下的 + 表现可能不一样,比如u'§'字符,在py2下的结果为'So', + 在py3下的结果是'Po'。 + """ + code = ord(ch) + return 33 <= code <= 47 or \ + 58 <= code <= 64 or \ + 91 <= code <= 96 or \ + 123 <= code <= 126 or \ + unicodedata.category(ch).startswith('P') + + @staticmethod + def _cjk_punctuation(): + return u'\uff02\uff03\uff04\uff05\uff06\uff07\uff08\uff09\uff0a\uff0b\uff0c\uff0d\uff0f\uff1a\uff1b\uff1c\uff1d\uff1e\uff20\uff3b\uff3c\uff3d\uff3e\uff3f\uff40\uff5b\uff5c\uff5d\uff5e\uff5f\uff60\uff62\uff63\uff64\u3000\u3001\u3003\u3008\u3009\u300a\u300b\u300c\u300d\u300e\u300f\u3010\u3011\u3014\u3015\u3016\u3017\u3018\u3019\u301a\u301b\u301c\u301d\u301e\u301f\u3030\u303e\u303f\u2013\u2014\u2018\u2019\u201b\u201c\u201d\u201e\u201f\u2026\u2027\ufe4f\ufe51\ufe54\u00b7\uff01\uff1f\uff61\u3002' + + @staticmethod + def _is_cjk_character(ch): + """CJK类字符判断(包括中文字符也在此列) + 参考:https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + """ + code = ord(ch) + return 0x4E00 <= code <= 0x9FFF or \ + 0x3400 <= code <= 0x4DBF or \ + 0x20000 <= code <= 0x2A6DF or \ + 0x2A700 <= code <= 0x2B73F or \ + 0x2B740 <= code <= 0x2B81F or \ + 0x2B820 <= code <= 0x2CEAF or \ + 0xF900 <= code <= 0xFAFF or \ + 0x2F800 <= code <= 0x2FA1F + + @staticmethod + def _is_control(ch): + """控制类字符判断 + """ + return unicodedata.category(ch) in ('Cc', 'Cf') + + @staticmethod + def _is_special(ch): + """判断是不是有特殊含义的符号 + """ + return bool(ch) and (ch[0] == '[') and (ch[-1] == ']') + + @staticmethod + def _is_redundant(token): + """判断该token是否冗余(默认情况下不可能分出来) + """ + if len(token) > 1: + for ch in Tokenizer.stem(token): + if ( + Tokenizer._is_cjk_character(ch) or + Tokenizer._is_punctuation(ch) + ): + return True + + def rematch(self, text, tokens): + """给出原始的text和tokenize后的tokens的映射关系 + """ + if is_py2: + text = unicode(text) + + if self._do_lower_case: + text = text.lower() + + normalized_text, char_mapping = '', [] + for i, ch in enumerate(text): + if 
self._do_lower_case: + ch = lowercase_and_normalize(ch) + ch = ''.join([ + c for c in ch + if not (ord(c) == 0 or ord(c) == 0xfffd or self._is_control(c)) + ]) + normalized_text += ch + char_mapping.extend([i] * len(ch)) + + text, token_mapping, offset = normalized_text, [], 0 + for token in tokens: + if self._is_special(token): + token_mapping.append([]) + else: + token = self.stem(token) + start = text[offset:].index(token) + offset + end = start + len(token) + token_mapping.append(char_mapping[start:end]) + offset = end + + return token_mapping + + +class SpTokenizer(TokenizerBase): + """基于SentencePiece模型的封装,使用上跟Tokenizer基本一致。 + """ + def __init__(self, sp_model_path, **kwargs): + super(SpTokenizer, self).__init__(**kwargs) + import sentencepiece as spm + self.sp_model = spm.SentencePieceProcessor() + self.sp_model.Load(sp_model_path) + self._token_pad = self.sp_model.id_to_piece(self.sp_model.pad_id()) + self._token_unk = self.sp_model.id_to_piece(self.sp_model.unk_id()) + self._vocab_size = self.sp_model.get_piece_size() + + for token in ['pad', 'unk', 'mask', 'start', 'end']: + try: + _token = getattr(self, '_token_%s' % token) + _token_id = self.sp_model.piece_to_id(_token) + setattr(self, '_token_%s_id' % token, _token_id) + except: + pass + + def token_to_id(self, token): + """token转换为对应的id + """ + return self.sp_model.piece_to_id(token) + + def id_to_token(self, i): + """id转换为对应的token + """ + if i < self._vocab_size: + return self.sp_model.id_to_piece(i) + else: + return '' + + def decode(self, ids): + """转为可读文本 + """ + tokens = [ + self._token_translate_inv.get(token) or token + for token in self.ids_to_tokens(ids) + ] + text = self.sp_model.decode_pieces(tokens) + return convert_to_unicode(text) + + def _tokenize(self, text): + """基本分词函数 + """ + if self._pre_tokenize is not None: + text = ' '.join(self._pre_tokenize(text)) + + tokens = self.sp_model.encode_as_pieces(text) + return tokens + + def _is_special(self, i): + """判断是不是有特殊含义的符号 + """ + return self.sp_model.is_control(i) or \ + self.sp_model.is_unknown(i) or \ + self.sp_model.is_unused(i) + + def _is_decodable(self, i): + """判断是否应该被解码输出 + """ + return (i < self._vocab_size) and not self._is_special(i) diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_conditional_language_model.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_conditional_language_model.py new file mode 100644 index 0000000000000000000000000000000000000000..287d02a422048146215ffad213468b10acb640f9 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_conditional_language_model.py @@ -0,0 +1,225 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! -*- coding: utf-8 -*- +# bert做conditional language model任务 +# 按类随机生成文本,这个demo的类别是情感极性(正/负) +# 请参考:https://kexue.fm/archives/7124 + +from __future__ import print_function +import re +import numpy as np +from bert4keras.backend import keras, K +from bert4keras.layers import Loss +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import Tokenizer, load_vocab +from bert4keras.optimizers import Adam +from bert4keras.snippets import sequence_padding, open +from bert4keras.snippets import text_segmentate +from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder +from bert4keras.snippets import uniout # 打印中文 +from keras.layers import Input, Embedding, Reshape +from keras.models import Model + +# 模型配置 +maxlen = 128 +batch_size = 32 +num_classes = 2 +epochs = 20 + +# bert配置 +config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' + +# 加载并精简词表,建立分词器 +token_dict, keep_tokens = load_vocab( + dict_path=dict_path, + simplified=True, + startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'], +) +tokenizer = Tokenizer(token_dict, do_lower_case=True) + + +def load_data(filenames): + """加载数据,并尽量划分为不超过maxlen的句子 + """ + D = [] + seps, strips = u'\n。!?!?;;,, ', u';;,, ' + for filename in filenames: + with open(filename, encoding='utf-8') as f: + for l in f: + text, label = l.strip().split('\t') + for t in text_segmentate(text, maxlen - 2, seps, strips): + D.append((t, int(label))) + return D + + +# 加载数据集 +data = load_data([ + 'datasets/sentiment/sentiment.train.data', + 'datasets/sentiment/sentiment.valid.data', + 'datasets/sentiment/sentiment.test.data', +]) + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + batch_token_ids, batch_segment_ids, batch_labels = [], [], [] + for is_end, (text, label) in self.sample(random): + token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) + batch_token_ids.append(token_ids) + batch_segment_ids.append(segment_ids) + batch_labels.append([label]) + if len(batch_token_ids) == self.batch_size or is_end: + batch_token_ids = sequence_padding(batch_token_ids) + batch_segment_ids = sequence_padding(batch_segment_ids) + batch_labels = sequence_padding(batch_labels) + yield [batch_token_ids, batch_segment_ids, batch_labels], None + batch_token_ids, batch_segment_ids, batch_labels = [], [], [] + + +class CrossEntropy(Loss): + """交叉熵作为loss,并mask掉padding部分 + """ + def compute_loss(self, inputs, mask=None): + y_true, y_pred = inputs + if mask[1] is None: + y_mask = 1.0 + else: + y_mask = K.cast(mask[1], K.floatx())[:, 1:] + y_true = y_true[:, 1:] # 目标token_ids + y_pred = y_pred[:, :-1] # 预测序列,错开一位 + loss = K.sparse_categorical_crossentropy(y_true, y_pred) + loss = K.sum(loss * y_mask) / K.sum(y_mask) + return loss + + +c_in = Input(shape=(1,)) +c = Embedding(num_classes, 128)(c_in) +c = Reshape((128,))(c) + +# Bert模型 +model = build_transformer_model( + config_path, + checkpoint_path, + application='lm', + keep_tokens=keep_tokens, # 
只保留keep_tokens中的字,精简原字表 + layer_norm_cond=c, + additional_input_layers=c_in, +) + +output = CrossEntropy(1)([model.inputs[0], model.outputs[0]]) + +model = Model(model.inputs, output) +model.compile(optimizer=Adam(1e-5)) +model.summary() + + +class RandomSentiment(AutoRegressiveDecoder): + """根据情感标签(0:负,1:正)随机生成一批句子 + """ + @AutoRegressiveDecoder.wraps(default_rtype='probas') + def predict(self, inputs, output_ids, states): + token_ids = output_ids + segment_ids = np.zeros_like(token_ids) + return self.last_token(model).predict([ + token_ids, segment_ids, inputs[0] + ]) + + def generate(self, label, n=1, topp=0.95): + results = self.random_sample([[label]], n, topp=topp) # 基于随机采样 + return [tokenizer.decode(ids) for ids in results] + + +random_sentiment = RandomSentiment( + start_id=tokenizer._token_start_id, + end_id=tokenizer._token_end_id, + maxlen=maxlen +) + + +def just_show(): + print(u'正面采样:') + print(random_sentiment.generate(1, 5, 5), '\n') + print(u'负面采样:') + print(random_sentiment.generate(0, 5, 5), '\n') + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.lowest = 1e10 + + def on_epoch_end(self, epoch, logs=None): + # 保存最优 + if logs['loss'] <= self.lowest: + self.lowest = logs['loss'] + model.save_weights('./best_model.weights') + # 演示效果 + just_show() + + +if __name__ == '__main__': + + evaluator = Evaluator() + train_generator = data_generator(data, batch_size) + + model.fit( + train_generator.forfit(), + steps_per_epoch=len(train_generator), + epochs=epochs, + callbacks=[evaluator] + ) + +else: + + model.load_weights('./best_model.weights') +""" +正面采样: +[ + u'外观时尚、漂亮、性价比高。', + u'外观漂亮,配置均衡,比较满意,性价比高,外观漂亮,性能较高。', + u'我是在大学的时候看到这本书的,所以一直在买。书中的作者是林静蕾,她用自己的口吻写出了一个孩子成长中的心路历程,让我看到了她们成长中的不同之处,以及她们成长过程中的不同境界。让我很欣赏!', + u'我想这是一本能够告诉读者什么是坏的,而不是教你怎样说话,告诉我什么是错。这里我推荐了《我要讲故事》,这本书是我很喜欢的一本书,我认为它的理由很多,但是,我相信我。如果你从中得到一些改进,或者你已经有了一个明智的决定。', + u'我们一家五口住的是标间,大床房,大床的床很舒服;而我们在携程网上订了两套大床房,这个酒店的价格还是比较合理的;但是房间的隔音效果不太理想,有点响的声音;酒店门口的地铁在施工中,不方便;但是酒店的门口的出租车不知道是哪个车的,打车不是很方便;酒店外面的停' +] + +负面采样: +[ + u'不知道是不是因为电池不太好,不是我不喜欢。', + u'看了评论才买的. 结果发现不是那么便宜, 价格也不便宜.', + u'1、外壳不容易沾手印,不容易洗洗2、屏幕有点旧, 不能下载铃声', + u'我是7月6日订购了《杜拉拉升职记》并已通过银行付款,为什么订单下了两周多至今还未到货?是收货时间太快了,可能就这么过去了吧?', + u'这本书我是在网上先看了一遍,后来我再看了一遍。感觉作者的文笔实在太烂了,特别是在写他的博客时特别别扭,写得很不专业,特别是他写股票时那个情绪调节的小男孩,简直就是自作聪明的样子,简直就是自作聪明的一种表现!' +] +""" diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_iflytek_adversarial_training.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_iflytek_adversarial_training.py new file mode 100644 index 0000000000000000000000000000000000000000..6ac1605070e610f82b6021fb90a595298ee85497 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_iflytek_adversarial_training.py @@ -0,0 +1,232 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! -*- coding:utf-8 -*- +# 通过对抗训练增强模型的泛化性能 +# 比CLUE榜单公开的同数据集上的BERT base的成绩高2% +# 数据集:IFLYTEK' 长文本分类 (https://github.com/CLUEbenchmark/CLUE) +# 博客:https://kexue.fm/archives/7234 +# 适用于Keras 2.3.1 + +import json +import numpy as np +from bert4keras.backend import keras, search_layer, K +from bert4keras.tokenizers import Tokenizer +from bert4keras.models import build_transformer_model +from bert4keras.optimizers import Adam +from bert4keras.snippets import sequence_padding, DataGenerator +from keras.layers import Lambda, Dense +from tqdm import tqdm + +num_classes = 119 +maxlen = 128 +batch_size = 32 + +# BERT base +config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' + + +def load_data(filename): + """加载数据 + 单条格式:(文本, 标签id) + """ + D = [] + with open(filename) as f: + for i, l in enumerate(f): + l = json.loads(l) + text, label = l['sentence'], l['label'] + D.append((text, int(label))) + return D + + +# 加载数据集 +train_data = load_data( + '/root/CLUE-master/baselines/CLUEdataset/iflytek/train.json' +) +valid_data = load_data( + '/root/CLUE-master/baselines/CLUEdataset/iflytek/dev.json' +) + +# 建立分词器 +tokenizer = Tokenizer(dict_path, do_lower_case=True) + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + batch_token_ids, batch_segment_ids, batch_labels = [], [], [] + for is_end, (text, label) in self.sample(random): + token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) + batch_token_ids.append(token_ids) + batch_segment_ids.append(segment_ids) + batch_labels.append([label]) + if len(batch_token_ids) == self.batch_size or is_end: + batch_token_ids = sequence_padding(batch_token_ids) + batch_segment_ids = sequence_padding(batch_segment_ids) + batch_labels = sequence_padding(batch_labels) + yield [batch_token_ids, batch_segment_ids], batch_labels + batch_token_ids, batch_segment_ids, batch_labels = [], [], [] + + +# 转换数据集 +train_generator = data_generator(train_data, batch_size) +valid_generator = data_generator(valid_data, batch_size) + +# 加载预训练模型 +bert = build_transformer_model( + config_path=config_path, + checkpoint_path=checkpoint_path, + return_keras_model=False, +) + +output = Lambda(lambda x: x[:, 0])(bert.model.output) +output = Dense( + units=num_classes, + activation='softmax', + kernel_initializer=bert.initializer +)(output) + +model = keras.models.Model(bert.model.input, output) +model.summary() + +model.compile( + loss='sparse_categorical_crossentropy', + optimizer=Adam(2e-5), + metrics=['sparse_categorical_accuracy'], +) + + +def adversarial_training(model, embedding_name, epsilon=1): + """给模型添加对抗训练 + 其中model是需要添加对抗训练的keras模型,embedding_name + 则是model里边Embedding层的名字。要在模型compile之后使用。 + """ + if 
model.train_function is None: # 如果还没有训练函数 + model._make_train_function() # 手动make + old_train_function = model.train_function # 备份旧的训练函数 + + # 查找Embedding层 + for output in model.outputs: + embedding_layer = search_layer(output, embedding_name) + if embedding_layer is not None: + break + if embedding_layer is None: + raise Exception('Embedding layer not found') + + # 求Embedding梯度 + embeddings = embedding_layer.embeddings # Embedding矩阵 + gradients = K.gradients(model.total_loss, [embeddings]) # Embedding梯度 + gradients = K.zeros_like(embeddings) + gradients[0] # 转为dense tensor + + # 封装为函数 + inputs = ( + model._feed_inputs + model._feed_targets + model._feed_sample_weights + ) # 所有输入层 + embedding_gradients = K.function( + inputs=inputs, + outputs=[gradients], + name='embedding_gradients', + ) # 封装为函数 + + def train_function(inputs): # 重新定义训练函数 + grads = embedding_gradients(inputs)[0] # Embedding梯度 + delta = epsilon * grads / (np.sqrt((grads**2).sum()) + 1e-8) # 计算扰动 + K.set_value(embeddings, K.eval(embeddings) + delta) # 注入扰动 + outputs = old_train_function(inputs) # 梯度下降 + K.set_value(embeddings, K.eval(embeddings) - delta) # 删除扰动 + return outputs + + model.train_function = train_function # 覆盖原训练函数 + + +# 写好函数后,启用对抗训练只需要一行代码 +adversarial_training(model, 'Embedding-Token', 0.5) + + +def evaluate(data): + total, right = 0., 0. + for x_true, y_true in data: + y_pred = model.predict(x_true).argmax(axis=1) + y_true = y_true[:, 0] + total += len(y_true) + right += (y_true == y_pred).sum() + return right / total + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.best_val_acc = 0. + + def on_epoch_end(self, epoch, logs=None): + val_acc = evaluate(valid_generator) + if val_acc > self.best_val_acc: + self.best_val_acc = val_acc + model.save_weights('best_model.weights') + print( + u'val_acc: %.5f, best_val_acc: %.5f\n' % + (val_acc, self.best_val_acc) + ) + + +def predict_to_file(in_file, out_file): + """输出预测结果到文件 + 结果文件可以提交到 https://www.cluebenchmarks.com 评测。 + """ + fw = open(out_file, 'w') + with open(in_file) as fr: + for l in tqdm(fr): + l = json.loads(l) + text = l['sentence'] + token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) + label = model.predict([[token_ids], [segment_ids]])[0].argmax() + l = json.dumps({'id': str(l['id']), 'label': str(label)}) + fw.write(l + '\n') + fw.close() + + +if __name__ == '__main__': + + evaluator = Evaluator() + + model.fit( + train_generator.forfit(), + steps_per_epoch=len(train_generator), + epochs=50, + callbacks=[evaluator] + ) + +else: + + model.load_weights('best_model.weights') + # predict_to_file('/root/CLUE-master/baselines/CLUEdataset/iflytek/test.json', 'iflytek_predict.json') diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_iflytek_bert_of_theseus.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_iflytek_bert_of_theseus.py new file mode 100644 index 0000000000000000000000000000000000000000..791dd1f03572ecb09b2c3db3d3f03d6caf130b43 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_iflytek_bert_of_theseus.py @@ -0,0 +1,257 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! -*- coding:utf-8 -*- +# 文本分类例子下的模型压缩 +# 方法为BERT-of-Theseus +# 论文:https://arxiv.org/abs/2002.02925 +# 博客:https://kexue.fm/archives/7575 + +import json +import numpy as np +from bert4keras.backend import keras, K +from bert4keras.tokenizers import Tokenizer +from bert4keras.models import build_transformer_model +from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr +from bert4keras.snippets import sequence_padding, DataGenerator +from bert4keras.snippets import open +from keras.layers import Input, Lambda, Dense, Layer +from keras.models import Model + +num_classes = 119 +maxlen = 128 +batch_size = 32 + +# BERT base +config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' + + +def load_data(filename): + """加载数据 + 单条格式:(文本, 标签id) + """ + D = [] + with open(filename) as f: + for i, l in enumerate(f): + l = json.loads(l) + text, label = l['sentence'], l['label'] + D.append((text, int(label))) + return D + + +# 加载数据集 +train_data = load_data( + '/root/CLUE-master/baselines/CLUEdataset/iflytek/train.json' +) +valid_data = load_data( + '/root/CLUE-master/baselines/CLUEdataset/iflytek/dev.json' +) + +# 建立分词器 +tokenizer = Tokenizer(dict_path, do_lower_case=True) + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + batch_token_ids, batch_segment_ids, batch_labels = [], [], [] + for is_end, (text, label) in self.sample(random): + token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) + batch_token_ids.append(token_ids) + batch_segment_ids.append(segment_ids) + batch_labels.append([label]) + if len(batch_token_ids) == self.batch_size or is_end: + batch_token_ids = sequence_padding(batch_token_ids) + batch_segment_ids = sequence_padding(batch_segment_ids) + batch_labels = sequence_padding(batch_labels) + yield [batch_token_ids, batch_segment_ids], batch_labels + batch_token_ids, batch_segment_ids, batch_labels = [], [], [] + + +# 转换数据集 +train_generator = data_generator(train_data, batch_size) +valid_generator = data_generator(valid_data, batch_size) + + +class BinaryRandomChoice(Layer): + """随机二选一 + """ + def __init__(self, **kwargs): + super(BinaryRandomChoice, self).__init__(**kwargs) + self.supports_masking = True + + def compute_mask(self, inputs, mask=None): + if mask is not None: + return mask[1] 
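+    # call() below implements the module replacement of BERT-of-Theseus: during
+    # training a single Bernoulli(0.5) draw (K.random_binomial with shape=[1])
+    # picks either the predecessor output (source) or the successor output
+    # (target) for the whole batch; outside training, K.in_train_phase always
+    # returns the successor output.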
+ + def call(self, inputs): + source, target = inputs + mask = K.random_binomial(shape=[1], p=0.5) + output = mask * source + (1 - mask) * target + return K.in_train_phase(output, target) + + def compute_output_shape(self, input_shape): + return input_shape[1] + + +def bert_of_theseus(predecessor, successor, classfier): + """bert of theseus + """ + inputs = predecessor.inputs + # 固定住已经训练好的层 + for layer in predecessor.model.layers: + layer.trainable = False + classfier.trainable = False + # Embedding层替换 + predecessor_outputs = predecessor.apply_embeddings(inputs) + successor_outputs = successor.apply_embeddings(inputs) + outputs = BinaryRandomChoice()([predecessor_outputs, successor_outputs]) + # Transformer层替换 + layers_per_module = predecessor.num_hidden_layers // successor.num_hidden_layers + for index in range(successor.num_hidden_layers): + predecessor_outputs = outputs + for sub_index in range(layers_per_module): + predecessor_outputs = predecessor.apply_main_layers( + predecessor_outputs, layers_per_module * index + sub_index + ) + successor_outputs = successor.apply_main_layers(outputs, index) + outputs = BinaryRandomChoice()([predecessor_outputs, successor_outputs]) + # 返回模型 + outputs = classfier(outputs) + model = Model(inputs, outputs) + return model + + +def evaluate(data, model): + total, right = 0., 0. + for x_true, y_true in data: + y_pred = model.predict(x_true).argmax(axis=1) + y_true = y_true[:, 0] + total += len(y_true) + right += (y_true == y_pred).sum() + return right / total + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self, savename): + self.best_val_acc = 0. + self.savename = savename + + def on_epoch_end(self, epoch, logs=None): + val_acc = evaluate(valid_generator, self.model) + if val_acc > self.best_val_acc: + self.best_val_acc = val_acc + self.model.save_weights(self.savename) + print( + u'val_acc: %.5f, best_val_acc: %.5f\n' % + (val_acc, self.best_val_acc) + ) + + +# 加载预训练模型(12层) +predecessor = build_transformer_model( + config_path=config_path, + checkpoint_path=checkpoint_path, + return_keras_model=False, + prefix='Predecessor-' +) + +# 加载预训练模型(3层) +successor = build_transformer_model( + config_path=config_path, + checkpoint_path=checkpoint_path, + return_keras_model=False, + num_hidden_layers=3, + prefix='Successor-' +) + +# 判别模型 +x_in = Input(shape=K.int_shape(predecessor.output)[1:]) +x = Lambda(lambda x: x[:, 0])(x_in) +x = Dense(units=num_classes, activation='softmax')(x) +classfier = Model(x_in, x) + +predecessor_model = Model(predecessor.inputs, classfier(predecessor.output)) +predecessor_model.compile( + loss='sparse_categorical_crossentropy', + optimizer=Adam(2e-5), # 用足够小的学习率 + metrics=['sparse_categorical_accuracy'], +) +predecessor_model.summary() + +successor_model = Model(successor.inputs, classfier(successor.output)) +successor_model.compile( + loss='sparse_categorical_crossentropy', + optimizer=Adam(2e-5), # 用足够小的学习率 + metrics=['sparse_categorical_accuracy'], +) +successor_model.summary() + +theseus_model = bert_of_theseus(predecessor, successor, classfier) +theseus_model.compile( + loss='sparse_categorical_crossentropy', + optimizer=Adam(2e-5), # 用足够小的学习率 + metrics=['sparse_categorical_accuracy'], +) +theseus_model.summary() + +if __name__ == '__main__': + + # 训练predecessor + predecessor_evaluator = Evaluator('best_predecessor.weights') + predecessor_model.fit( + train_generator.forfit(), + steps_per_epoch=len(train_generator), + epochs=5, + callbacks=[predecessor_evaluator] + ) + + # 训练theseus + 
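+    # In this stage the predecessor layers and the classifier are frozen (set in
+    # bert_of_theseus above), so only the successor modules receive gradients
+    # while BinaryRandomChoice randomly swaps them in for predecessor modules.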
theseus_evaluator = Evaluator('best_theseus.weights') + theseus_model.fit( + train_generator.forfit(), + steps_per_epoch=len(train_generator), + epochs=10, + callbacks=[theseus_evaluator] + ) + theseus_model.load_weights('best_theseus.weights') + + # 训练successor + successor_evaluator = Evaluator('best_successor.weights') + successor_model.fit( + train_generator.forfit(), + steps_per_epoch=len(train_generator), + epochs=5, + callbacks=[successor_evaluator] + ) diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_iflytek_gradient_penalty.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_iflytek_gradient_penalty.py new file mode 100644 index 0000000000000000000000000000000000000000..7ce2c6e8fd0fee9cbf400e77c0ecbad6a048fd73 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_iflytek_gradient_penalty.py @@ -0,0 +1,205 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding:utf-8 -*- +# 通过梯度惩罚增强模型的泛化性能 +# 比CLUE榜单公开的同数据集上的BERT base的成绩高2% +# 数据集:IFLYTEK' 长文本分类 (https://github.com/CLUEbenchmark/CLUE) +# 博客:https://kexue.fm/archives/7234 +# 适用于Keras 2.3.1 + +import json +import numpy as np +from bert4keras.backend import keras, search_layer, K +from bert4keras.tokenizers import Tokenizer +from bert4keras.models import build_transformer_model +from bert4keras.optimizers import Adam +from bert4keras.snippets import sequence_padding, DataGenerator +from keras.layers import Lambda, Dense +from tqdm import tqdm + +num_classes = 119 +maxlen = 128 +batch_size = 32 + +# BERT base +config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' + + +def load_data(filename): + """加载数据 + 单条格式:(文本, 标签id) + """ + D = [] + with open(filename) as f: + for i, l in enumerate(f): + l = json.loads(l) + text, label = l['sentence'], l['label'] + D.append((text, int(label))) + return D + + +# 加载数据集 +train_data = load_data( + '/root/CLUE-master/baselines/CLUEdataset/iflytek/train.json' +) +valid_data = load_data( + '/root/CLUE-master/baselines/CLUEdataset/iflytek/dev.json' +) + +# 建立分词器 +tokenizer = Tokenizer(dict_path, do_lower_case=True) + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + batch_token_ids, batch_segment_ids, batch_labels = [], [], [] + for is_end, (text, label) in self.sample(random): + token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) + batch_token_ids.append(token_ids) + batch_segment_ids.append(segment_ids) + batch_labels.append([label]) + if len(batch_token_ids) == self.batch_size or is_end: + batch_token_ids = sequence_padding(batch_token_ids) + batch_segment_ids = sequence_padding(batch_segment_ids) + batch_labels = sequence_padding(batch_labels) + yield [batch_token_ids, batch_segment_ids], batch_labels + batch_token_ids, batch_segment_ids, batch_labels = [], [], [] + + +# 转换数据集 +train_generator = data_generator(train_data, batch_size) +valid_generator = data_generator(valid_data, batch_size) + +# 加载预训练模型 +bert = build_transformer_model( + config_path=config_path, + checkpoint_path=checkpoint_path, + return_keras_model=False, +) + +output = Lambda(lambda x: x[:, 0])(bert.model.output) +output = Dense( + units=num_classes, + activation='softmax', + kernel_initializer=bert.initializer +)(output) + +model = keras.models.Model(bert.model.input, output) +model.summary() + + +def sparse_categorical_crossentropy(y_true, y_pred): + """自定义稀疏交叉熵 + 这主要是因为keras自带的sparse_categorical_crossentropy不支持求二阶梯度。 + """ + y_true = K.reshape(y_true, K.shape(y_pred)[:-1]) + y_true = K.cast(y_true, 'int32') + y_true = K.one_hot(y_true, K.shape(y_pred)[-1]) + return K.categorical_crossentropy(y_true, y_pred) + + +def loss_with_gradient_penalty(y_true, y_pred, epsilon=1): + """带梯度惩罚的loss + """ + loss = K.mean(sparse_categorical_crossentropy(y_true, y_pred)) + embeddings = search_layer(y_pred, 'Embedding-Token').embeddings + gp = K.sum(K.gradients(loss, [embeddings])[0].values**2) + return loss + 0.5 * epsilon * gp + + +model.compile( + loss=loss_with_gradient_penalty, + optimizer=Adam(2e-5), + metrics=['sparse_categorical_accuracy'], +) + + +def evaluate(data): + total, right = 0., 0. 
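+    # Accumulate top-1 accuracy over all batches produced by the generator.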
+ for x_true, y_true in data: + y_pred = model.predict(x_true).argmax(axis=1) + y_true = y_true[:, 0] + total += len(y_true) + right += (y_true == y_pred).sum() + return right / total + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.best_val_acc = 0. + + def on_epoch_end(self, epoch, logs=None): + val_acc = evaluate(valid_generator) + if val_acc > self.best_val_acc: + self.best_val_acc = val_acc + model.save_weights('best_model.weights') + print( + u'val_acc: %.5f, best_val_acc: %.5f\n' % + (val_acc, self.best_val_acc) + ) + + +def predict_to_file(in_file, out_file): + """输出预测结果到文件 + 结果文件可以提交到 https://www.cluebenchmarks.com 评测。 + """ + fw = open(out_file, 'w') + with open(in_file) as fr: + for l in tqdm(fr): + l = json.loads(l) + text = l['sentence'] + token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) + label = model.predict([[token_ids], [segment_ids]])[0].argmax() + l = json.dumps({'id': str(l['id']), 'label': str(label)}) + fw.write(l + '\n') + fw.close() + + +if __name__ == '__main__': + + evaluator = Evaluator() + + model.fit( + train_generator.forfit(), + steps_per_epoch=len(train_generator), + epochs=50, + callbacks=[evaluator] + ) + +else: + + model.load_weights('best_model.weights') + # predict_to_file('/root/CLUE-master/baselines/CLUEdataset/iflytek/test.json', 'iflytek_predict.json') diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_iflytek_multigpu.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_iflytek_multigpu.py new file mode 100644 index 0000000000000000000000000000000000000000..4460f08bede6840787145218ab1a81914f951ca4 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_iflytek_multigpu.py @@ -0,0 +1,191 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding:utf-8 -*- +# 文本分类多gpu版 +# 数据集:IFLYTEK' 长文本分类 (https://github.com/CLUEbenchmark/CLUE) + +import os + +os.environ['TF_KERAS'] = '1' # 必须使用tf.keras + +import json +import numpy as np +import tensorflow as tf +from bert4keras.backend import keras, K +from bert4keras.tokenizers import Tokenizer +from bert4keras.models import build_transformer_model +from bert4keras.optimizers import Adam +from bert4keras.snippets import sequence_padding, DataGenerator, to_array +from keras.layers import Lambda, Dense +from tqdm import tqdm + +num_classes = 119 +maxlen = 128 +batch_size = 32 + +# BERT base +config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' + + +def load_data(filename): + """加载数据 + 单条格式:(文本, 标签id) + """ + D = [] + with open(filename) as f: + for i, l in enumerate(f): + l = json.loads(l) + text, label = l['sentence'], l['label'] + D.append((text, int(label))) + return D + + +# 加载数据集 +train_data = load_data( + '/root/CLUE-master/baselines/CLUEdataset/iflytek/train.json' +) +valid_data = load_data( + '/root/CLUE-master/baselines/CLUEdataset/iflytek/dev.json' +) + +# 建立分词器 +tokenizer = Tokenizer(dict_path, do_lower_case=True) + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + for is_end, (text, label) in self.sample(random): + token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) + yield [token_ids, segment_ids], [[label]] # 返回一条样本 + + +# 转换数据集 +train_generator = data_generator(train_data, batch_size) +valid_generator = data_generator(valid_data, batch_size) + +# 建立单机多卡策略 +strategy = tf.distribute.MirroredStrategy() + +with strategy.scope(): # 调用该策略 + + # 加载预训练模型 + bert = build_transformer_model( + config_path=config_path, + checkpoint_path=None, + return_keras_model=False, + ) + + output = Lambda(lambda x: x[:, 0])(bert.model.output) + output = Dense( + units=num_classes, + activation='softmax', + kernel_initializer=bert.initializer, + name='Probas' + )(output) + + model = keras.models.Model(bert.model.input, output) + model.compile( + loss='sparse_categorical_crossentropy', + optimizer=Adam(2e-5), + metrics=['sparse_categorical_accuracy'], + ) + model.summary() + bert.load_weights_from_checkpoint(checkpoint_path) # 必须最后才加载预训练权重 + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.best_val_acc = 0. 
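+        # Best accuracy observed so far; weights are saved only when it improves.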
+ + def on_epoch_end(self, epoch, logs=None): + val_acc = logs['sparse_categorical_accuracy'] + if val_acc > self.best_val_acc: + self.best_val_acc = val_acc + model.save_weights('best_model.weights') + print( + u'val_acc: %.5f, best_val_acc: %.5f\n' % + (val_acc, self.best_val_acc) + ) + + +def predict_to_file(in_file, out_file): + """输出预测结果到文件 + 结果文件可以提交到 https://www.cluebenchmarks.com 评测。 + """ + fw = open(out_file, 'w') + with open(in_file) as fr: + for l in tqdm(fr): + l = json.loads(l) + text = l['sentence'] + token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) + token_ids, segment_ids = to_array([token_ids], [segment_ids]) + label = model.predict([token_ids, segment_ids])[0].argmax() + l = json.dumps({'id': str(l['id']), 'label': str(label)}) + fw.write(l + '\n') + fw.close() + + +if __name__ == '__main__': + + evaluator = Evaluator() + + train_dataset = train_generator.to_dataset( + types=[('float32', 'float32'), ('float32',)], + shapes=[([None], [None]), ([1],)], # 配合后面的padded_batch=True,实现自动padding + names=[('Input-Token', 'Input-Segment'), ('Probas',)], + padded_batch=True + ) # 数据要转为tf.data.Dataset格式,names跟输入层/输出层的名字对应 + + valid_dataset = valid_generator.to_dataset( + types=[('float32', 'float32'), ('float32',)], + shapes=[([None], [None]), ([1],)], # 配合后面的padded_batch=True,实现自动padding + names=[('Input-Token', 'Input-Segment'), ('Probas',)], + padded_batch=True + ) # 数据要转为tf.data.Dataset格式,names跟输入层/输出层的名字对应 + + model.fit( + train_dataset, + steps_per_epoch=len(train_generator), + epochs=10, + validation_data=valid_dataset, + validation_steps=len(valid_generator), + callbacks=[evaluator] + ) + +else: + + model.load_weights('best_model.weights') + # predict_to_file('/root/CLUE-master/baselines/CLUEdataset/iflytek/test.json', 'iflytek_predict.json') diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_image_caption.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_image_caption.py new file mode 100644 index 0000000000000000000000000000000000000000..31ca8dc11022579d94ac1d1342b315067a3a1cad --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_image_caption.py @@ -0,0 +1,266 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding: utf-8 -*- +# bert做image caption任务,coco数据集 +# 通过Conditional Layer Normalization融入条件信息 +# 请参考:https://kexue.fm/archives/7124 + +from __future__ import print_function +import json +import numpy as np +from bert4keras.backend import keras, K +from bert4keras.layers import Loss +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import Tokenizer, load_vocab +from bert4keras.optimizers import Adam +from bert4keras.snippets import sequence_padding, is_string +from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder +from keras.models import Model +import cv2 + +# 模型配置 +maxlen = 64 +batch_size = 32 +steps_per_epoch = 1000 +epochs = 10000 + +# bert配置 +config_path = '/root/kg/bert/uncased_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/uncased_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/uncased_L-12_H-768_A-12/vocab.txt' + +# 加载并精简词表,建立分词器 +token_dict, keep_tokens = load_vocab( + dict_path=dict_path, + simplified=True, + startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'], +) +tokenizer = Tokenizer(token_dict, do_lower_case=True) + + +def read_caption(f): + """读取并整理COCO的Caption数据 + """ + data = json.load(open(f)) + images = {} + for img in data['images']: + images[img['id']] = { + 'image_id': img['file_name'], + 'caption': [], + 'url': img['coco_url'] + } + for caption in data['annotations']: + images[caption['image_id']]['caption'].append(caption['caption']) + return list(images.values()) + + +def read_image(f): + """单图读取函数(对非方形的图片进行白色填充,使其变为方形) + """ + img = cv2.imread(f) + height, width = img.shape[:2] + if height > width: + height, width = img_size, width * img_size // height + img = cv2.resize(img, (width, height)) + delta = (height - width) // 2 + img = cv2.copyMakeBorder( + img, + top=0, + bottom=0, + left=delta, + right=height - width - delta, + borderType=cv2.BORDER_CONSTANT, + value=[255, 255, 255] + ) + else: + height, width = height * img_size // width, img_size + img = cv2.resize(img, (width, height)) + delta = (width - height) // 2 + img = cv2.copyMakeBorder( + img, + top=delta, + bottom=width - height - delta, + left=0, + right=0, + borderType=cv2.BORDER_CONSTANT, + value=[255, 255, 255] + ) + img = img.astype('float32') + return img[..., ::-1] # cv2的读取模式为BGR,但keras的模型要求为RGB + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + batch_images, batch_token_ids, batch_segment_ids = [], [], [] + for is_end, D in self.sample(random): + img = '/root/caption/coco/train2014/%s' % D['image_id'] + caption = np.random.choice(D['caption']) + token_ids, segment_ids = tokenizer.encode(caption, maxlen=maxlen) + batch_images.append(read_image(img)) + batch_token_ids.append(token_ids) + batch_segment_ids.append(segment_ids) + if len(batch_token_ids) == self.batch_size or is_end: + batch_images = np.array(batch_images) + batch_images = preprocess_input(batch_images) + batch_token_ids = sequence_padding(batch_token_ids) + batch_segment_ids = sequence_padding(batch_segment_ids) + yield [batch_token_ids, batch_segment_ids, batch_images], None + batch_images, batch_token_ids, batch_segment_ids = [], [], [] + + +# 加载数据 +train_data = read_caption( + '/root/caption/coco/annotations/captions_train2014.json' +) +valid_data = read_caption( + '/root/caption/coco/annotations/captions_val2014.json' +) + + +class CrossEntropy(Loss): + """交叉熵作为loss,并mask掉padding部分 + """ + def compute_loss(self, inputs, mask=None): + y_true, y_pred = inputs + if mask[1] is None: + y_mask = 1.0 + else: + 
y_mask = K.cast(mask[1], K.floatx())[:, 1:] + y_true = y_true[:, 1:] # 目标token_ids + y_pred = y_pred[:, :-1] # 预测序列,错开一位 + loss = K.sparse_categorical_crossentropy(y_true, y_pred) + loss = K.sum(loss * y_mask) / K.sum(y_mask) + return loss + + +# 图像模型 +MobileNetV2 = keras.applications.mobilenet_v2.MobileNetV2 +preprocess_input = keras.applications.mobilenet_v2.preprocess_input +image_model = MobileNetV2(include_top=False, pooling='avg') +img_size = 299 + +# Bert模型 +model = build_transformer_model( + config_path, + checkpoint_path, + application='lm', + keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表 + layer_norm_cond=image_model.output, + layer_norm_cond_hidden_size=128, + layer_norm_cond_hidden_act='swish', + additional_input_layers=image_model.input, +) + +output = CrossEntropy(1)([model.inputs[0], model.outputs[0]]) + +model = Model(model.inputs, output) +model.compile(optimizer=Adam(1e-5)) +model.summary() + + +class AutoCaption(AutoRegressiveDecoder): + """img2seq解码器 + """ + @AutoRegressiveDecoder.wraps(default_rtype='probas') + def predict(self, inputs, output_ids, states): + image = inputs[0] + token_ids = output_ids + segment_ids = np.zeros_like(token_ids) + return self.last_token(model).predict([token_ids, segment_ids, image]) + + def generate(self, image, topk=1): + if is_string(image): + image = read_image(image) + image = preprocess_input(image) + output_ids = self.beam_search([image], topk=topk) # 基于beam search + return tokenizer.decode(output_ids) + + +autocaption = AutoCaption( + start_id=tokenizer._token_start_id, + end_id=tokenizer._token_end_id, + maxlen=maxlen +) + + +def just_show(): + samples = [valid_data[i] for i in np.random.choice(len(valid_data), 2)] + for D in samples: + img = '/root/caption/coco/val2014/%s' % D['image_id'] + print(u'image_id:', D['image_id']) + print(u'url:', D['url']) + print(u'predict:', autocaption.generate(img)) + print(u'references:', D['caption']) + print() + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.lowest = 1e10 + + def on_epoch_end(self, epoch, logs=None): + # 保存最优 + if logs['loss'] <= self.lowest: + self.lowest = logs['loss'] + model.save_weights('./best_model.weights') + # 演示效果 + just_show() + + +if __name__ == '__main__': + + evaluator = Evaluator() + train_generator = data_generator(train_data, batch_size) + + model.fit( + train_generator.forfit(), + steps_per_epoch=steps_per_epoch, + epochs=epochs, + callbacks=[evaluator] + ) + +else: + + model.load_weights('./best_model.weights') +""" +image_id: COCO_val2014_000000524611.jpg +url: http://images.cocodataset.org/val2014/COCO_val2014_000000524611.jpg +predict: a train that is sitting on the tracks. +references: [u'A train carrying chemical tanks traveling past a water tower.', u'Dual train tracks with a train on one of them and a water tower in the background.', u'a train some trees and a water tower ', u'Train on tracks with water tower for Davis Junction in the rear.', u'A train on a train track going through a bunch of trees.'] + +image_id: COCO_val2014_000000202923.jpg +url: http://images.cocodataset.org/val2014/COCO_val2014_000000202923.jpg +predict: a baseball game in progress with the batter up to plate. 
+references: [u'Batter, catcher, and umpire anticipating the next pitch.', u'A baseball player holding a baseball bat in the game.', u'A baseball player stands ready at the plate.', u'Baseball players on the field ready for the pitch.', u'A view from behind a mesh fence of a baseball game.'] +""" diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_language_model.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_language_model.py new file mode 100644 index 0000000000000000000000000000000000000000..ddfd36d1a3e9fa775adeb463d0e703ac55548458 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_language_model.py @@ -0,0 +1,220 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding: utf-8 -*- +# bert做language model任务,小说生成 + +from __future__ import print_function +import glob, re +import numpy as np +from tqdm import tqdm +from bert4keras.backend import keras, K +from bert4keras.layers import Loss +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import Tokenizer, load_vocab +from bert4keras.optimizers import Adam +from bert4keras.snippets import sequence_padding, open +from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder +from keras.models import Model + +maxlen = 256 +batch_size = 16 +steps_per_epoch = 1000 +epochs = 10000 + +# bert配置 +config_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt' + +novels = [] + +for txt in glob.glob('/root/金庸/*/*.txt'): + txt = open(txt, encoding='gbk').read() + txt = txt.replace('\r', '').replace('\n', '') + txt = txt.replace(u'整理制作,并提供下载', '') + txt = re.sub(u'www.*?com', '', txt) + txt = txt.replace(u'\u3000', ' ') + sents = [] + for t in txt.split(' '): + for s in re.findall(u'.*?。', t): + if len(s) <= maxlen - 2: + sents.append(s) + novels.append(sents) + +# 加载并精简词表,建立分词器 +token_dict, keep_tokens = load_vocab( + dict_path=dict_path, + simplified=True, + startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'], +) +tokenizer = Tokenizer(token_dict, do_lower_case=True) + +data = [] +pbar = tqdm(desc=u'构建语料中', total=sum(len(n) for n in novels)) + +for novel in novels: + s = u'' + for i in range(len(novel)): + for j in range(len(novel) - i): + if len(s) + len(novel[i + j]) > maxlen - 2: + data.append(s) + s = u'' + break + else: + s += novel[i + j] + pbar.update(1) + if i + j >= len(novel): + break + if s: + data.append(s) + +pbar.close() +np.random.shuffle(data) + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + batch_token_ids, batch_segment_ids = [], [] + for is_end, text in self.sample(random): + token_ids, segment_ids = tokenizer.encode(text) + batch_token_ids.append(token_ids) + batch_segment_ids.append(segment_ids) + if len(batch_token_ids) == self.batch_size or is_end: + batch_token_ids = sequence_padding(batch_token_ids) + batch_segment_ids = sequence_padding(batch_segment_ids) + yield [batch_token_ids, batch_segment_ids], None + batch_token_ids, batch_segment_ids = [], [] + + +class CrossEntropy(Loss): + """交叉熵作为loss,并mask掉padding部分 + """ + def compute_loss(self, inputs, mask=None): + y_true, y_pred = inputs + if mask[1] is None: + y_mask = 1.0 + else: + y_mask = K.cast(mask[1], K.floatx())[:, 1:] + y_true = y_true[:, 1:] # 目标token_ids + y_pred = y_pred[:, :-1] # 预测序列,错开一位 + loss = K.sparse_categorical_crossentropy(y_true, y_pred) + loss = K.sum(loss * y_mask) / K.sum(y_mask) + return loss + + +model = build_transformer_model( + config_path, + checkpoint_path, + application='lm', + keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表 +) + +output = CrossEntropy(1)([model.inputs[0], model.outputs[0]]) + +model = Model(model.inputs, output) +model.compile(optimizer=Adam(1e-5)) +model.summary() + + +class StoryCompletion(AutoRegressiveDecoder): + """基于随机采样的故事续写 + """ + @AutoRegressiveDecoder.wraps(default_rtype='probas') + def predict(self, inputs, output_ids, states): + token_ids = inputs[0] + token_ids = np.concatenate([token_ids, output_ids], 1) + segment_ids = np.zeros_like(token_ids) + return 
self.last_token(model).predict([token_ids, segment_ids]) + + def generate(self, text, n=1, topp=0.95): + token_ids, _ = tokenizer.encode(text) + results = self.random_sample([token_ids[:-1]], n, topp=topp) # 基于随机采样 + return [text + tokenizer.decode(ids) for ids in results] + + +story_completion = StoryCompletion( + start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen +) + + +def just_show(): + s1 = u'当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。' + s2 = u'虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。' + s3 = u'杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。' + for s in [s1, s2, s3]: + t = story_completion.generate(s) + print(u'输入: %s' % s) + print(u'结果: %s\n' % ('\n'.join(t))) + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.lowest = 1e10 + + def on_epoch_end(self, epoch, logs=None): + # 保存最优 + if logs['loss'] <= self.lowest: + self.lowest = logs['loss'] + model.save_weights('./best_model.weights') + # 演示效果 + just_show() + + +if __name__ == '__main__': + + evaluator = Evaluator() + train_generator = data_generator(data, batch_size) + + model.fit( + train_generator.forfit(), + steps_per_epoch=steps_per_epoch, + epochs=epochs, + callbacks=[evaluator] + ) + +else: + + model.load_weights('./best_model.weights') +""" +效果: + +输入: 当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。 +结果: 当晚两人在一家小客店中宿歇。张无忌躺在炕上,越想越是担心,走到赵敏窗外,但听她呼吸调匀,正自香梦沉酣。次日清晨,张无忌便和赵敏去买了一匹高头大马,自己骑了随伴。那马甚有神骏,三十六斤重的身躯之中,竟无一头白马。他心中怦怦乱跳,暗想:若能将赵敏引出迷城,我决不致再和她相会,但若和赵姑娘相遇,我一生一世决计再难相见。何况我是她的私生女儿,这般亲热,岂不是好?我如何能和她相见?今后我要教训教训她才好?我教教她,教训她,要她心里快快活活的。他心如刀割,当即回到客店,将张无忌的所在说了。 + +输入: 虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。 +结果: 虚竹飞身跃上松树的枝干,只见段延庆的钢杖深深嵌在树枝之中,全凭一股内力粘劲,挂住了下面四人,内力之深厚,实是非同小可。虚竹伸左手抓住钢杖,提将上来。那矮子见他如此功力,大吃一惊,叫道:什么人?是谁?你干什么?我师父是谁?你们是谁?是谁?你们是谁?我师父是谁?你这矮子,便是段延庆。你们不知道我师父便是,是不是?快快说来。那矮子道:我师父便是延庆太子,他的徒弟也是段延庆。他老人家在唐朝做镇南王,你们便将他改名为延庆太子,叫做延庆太子!这名头倒怪,你们大伙儿听见了,也不知道他老人家是死是活。 + +输入: 杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。 +结果: 杨过居住在侠客岛,是令狐冲的弟子,武器是金蛇剑。这时见他手中所握,竟是一柄特制的短剑,心中大喜,叫道::原来是金蛇郎君的剑!原来你便是金蛇郎君的弟子,这一下可要叫我失望了。那人哈哈一笑,说道:好啊!好啊,好啊!我的金蛇剑是我的,不过我是你的。这人道:我姓杨名过,名字叫过。你是我儿子,是我女儿,是不是?你这么大的年纪,怎地自称金刀驸马?我这就给你取个名字,叫作过儿。 +""" diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_language_model_chinese_chess.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_language_model_chinese_chess.py new file mode 100644 index 0000000000000000000000000000000000000000..0e49831c3cd4d32d9ddcd7cbb7f4f52fb9b59cdb --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_language_model_chinese_chess.py @@ -0,0 +1,270 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! -*- coding: utf-8 -*- +# 用 语言模型+棋谱 的方式监督训练一个下中国象棋模型 +# 介绍:https://kexue.fm/archives/7877 +# 数据:https://github.com/bojone/gpt_cchess +# 模型训练可以在python2/python3进行。但是cchess模块只支持python3, +# 因此如果需要交互式体验模型棋力,那么需要在python3下进行。 + +import json +import numpy as np +from bert4keras.backend import keras, K +from bert4keras.layers import Loss +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import Tokenizer, load_vocab +from bert4keras.optimizers import Adam +from bert4keras.snippets import sequence_padding, open +from bert4keras.snippets import DataGenerator +from keras.models import Model +from cchess import * + +# 基本信息 +maxlen = 512 +steps_per_epoch = 1000 +epochs = 10000 +batch_size = 16 + +# bert配置 +config_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt' + + +def load_data(filename): + """读取全局棋谱 + 返回:[(棋谱, 结果)],其中结果等于2为红方赢棋,1为和棋, + 0为黑方赢棋,-1则为无明确标注胜负。 + """ + D = [] + with open(filename) as f: + for l in f: + l = json.loads(l) + if not l['fen']: + result = int(l['items'].get(u'棋局结果', -1)) + D.append((l['iccs'], result)) + return D + + +# 加载数据 +data = load_data('/root/qipu.json') + +# 建立分词器 +chars = [u'[PAD]'] + list(u'0123456789abcdefghi') +token_dict = dict(zip(chars, range(len(chars)))) +tokenizer = Tokenizer(token_dict) +tokenizer._token_unk_id = 0 +bert_token_dict = load_vocab(dict_path) +keep_tokens = [bert_token_dict[c] for c in chars] + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + batch_token_ids, batch_segment_ids = [], [] + for is_end, (text, label) in self.sample(random): + token_ids, segment_ids = tokenizer.encode( + ' '.join(text), maxlen=maxlen // self.n + 1 + ) + batch_token_ids.append([0] + token_ids[1:-1]) + batch_segment_ids.append([0] + segment_ids[1:-1]) + if len(batch_token_ids) == self.batch_size or is_end: + batch_token_ids = sequence_padding(batch_token_ids) + batch_segment_ids = sequence_padding(batch_segment_ids) + yield [batch_token_ids, batch_segment_ids], None + batch_token_ids, batch_segment_ids = [], [] + self.count += 1 + + @property + def n(self): + if not hasattr(self, 'count'): + self.count = 0 + if self.count < 20000: + n = 8 + elif self.count < 40000: + n = 4 + elif self.count < 80000: + n = 2 + else: + n = 1 + return n + + +class CrossEntropy(Loss): + """交叉熵作为loss,并mask掉padding部分 + """ + def compute_loss(self, inputs, mask=None): + y_true, y_pred = inputs + if mask[1] is None: + y_mask = 1.0 + else: + y_mask = K.cast(mask[1], K.floatx())[:, 1:] + y_true = y_true[:, 1:] # 目标token_ids + y_pred = y_pred[:, :-1] # 预测序列,错开一位 + loss = K.sparse_categorical_crossentropy(y_true, y_pred) + loss = K.sum(loss * y_mask) / K.sum(y_mask) + return loss + + +model = build_transformer_model( + config_path, + checkpoint_path, + application='lm', + keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表 +) + +output = CrossEntropy(1)([model.inputs[0], model.outputs[0]]) + +model = 
Model(model.inputs, output) +model.compile(optimizer=Adam(1e-5)) +model.summary() + + +class ChessPlayer(object): + """交互式下棋程序 + """ + def move_to_chinese(self, move): + """将单步走法转为中文描述 + """ + if not isinstance(move, Move): + move = Move(self.board, move[0], move[1]) + return move.to_chinese() + + def move_to_iccs(self, move): + """将单步走法转为iccs表示 + """ + if not isinstance(move, Move): + move = Move(self.board, move[0], move[1]) + return move.to_iccs() + + def print_board(self): + """打印当前棋盘 + 直观起见,红方用红色表示,黑方用绿色表示。 + """ + for l in self.board.dump_board(): + for c in u'兵炮车马相仕帅': + l = l.replace(c, u'\033[1;31;40m%s\033[0m' % c) + for c in u'卒砲砗碼象士将': + l = l.replace(c, u'\033[1;32;40m%s\033[0m' % c) + print(l) + + def movable_steps(self): + """给出当前局面所有候选走法 + """ + return [self.move_to_iccs(m) for m in self.board.create_moves()] + + def human_input(self): + """人类行棋 + """ + while True: + try: + iccs = input(u'请输入iccs棋着: ') + print(iccs) + move = self.board.move_iccs(iccs) + if move is not None: + return iccs, move + except KeyboardInterrupt: + return None + except: + pass + + def record(self, iccs): + """将局面往前推进一步 + """ + self.history += iccs + self.board.next_turn() + self.print_board() + self.current = (self.current + 1) % 2 + + def new_game(self, current=0): + """开新局 + """ + self.board = ChessBoard() + self.board.from_fen(FULL_INIT_FEN) + self.print_board() + self.history = '' + self.current = current + if self.current == 0: # 人类先手 + iccs, move = self.human_input() + self.record(iccs) + while True: + # 机器走棋 + moves = self.movable_steps() + iccses = [' '.join(self.history + m) for m in moves] + token_ids = [[0] + tokenizer.encode(ic)[0][1:-1] for ic in iccses] + token_ids = np.array(token_ids) + segment_ids = np.zeros_like(token_ids) + preds = model.predict([token_ids, segment_ids])[:, -5:-1] + preds = np.take_along_axis(preds, token_ids[:, -4:, None], axis=2) + preds = np.log(preds + 1e-8)[:, :, 0].sum(axis=1) + iccs = moves[preds.argmax()] + move = self.board.move_iccs(iccs) + self.record(iccs) + if self.board.is_win(): + print(u'机器赢了') + break + # 人类走棋 + iccs, move = self.human_input() + self.record(iccs) + if self.board.is_win(): + print(u'人类赢了') + break + + +chessplayer = ChessPlayer() +""" +chessplayer.new_game(0) # 启动新棋局,0为人类先手,1为机器先手 +""" + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def on_epoch_end(self, epoch, logs=None): + # 保存模型 + model.save_weights('./best_model.weights') + + +if __name__ == '__main__': + + evaluator = Evaluator() + train_generator = data_generator(data, batch_size) + + model.fit( + train_generator.forfit(), + steps_per_epoch=steps_per_epoch, + epochs=epochs, + callbacks=[evaluator] + ) + +else: + + model.load_weights('./best_model.weights') diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_question_answer_generation_by_seq2seq.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_question_answer_generation_by_seq2seq.py new file mode 100644 index 0000000000000000000000000000000000000000..a90cb3b8f870348f4ac1cbf058434e1c9cb0bb10 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_question_answer_generation_by_seq2seq.py @@ -0,0 +1,213 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! -*- coding: utf-8- -*- +# 用Seq2Seq做阅读理解构建 +# 根据篇章先采样生成答案,然后采样生成问题 +# 数据集同 https://github.com/bojone/dgcnn_for_reading_comprehension + +import json, os +import numpy as np +from bert4keras.backend import keras, K +from bert4keras.layers import Loss +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import Tokenizer, load_vocab +from bert4keras.optimizers import Adam +from bert4keras.snippets import sequence_padding, open +from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder +from bert4keras.snippets import text_segmentate +from keras.models import Model +from tqdm import tqdm + +# 基本参数 +max_p_len = 128 +max_q_len = 64 +max_a_len = 16 +batch_size = 32 +epochs = 100 + +# bert配置 +config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' + +# 标注数据 +webqa_data = json.load(open('/root/qa_datasets/WebQA.json')) +sogou_data = json.load(open('/root/qa_datasets/SogouQA.json')) + +# 筛选数据 +seps, strips = u'\n。!?!?;;,, ', u';;,, ' +data = [] +for d in webqa_data + sogou_data: + for p in d['passages']: + if p['answer']: + for t in text_segmentate(p['passage'], max_p_len - 2, seps, strips): + if p['answer'] in t: + data.append((t, d['question'], p['answer'])) + +del webqa_data +del sogou_data + +# 保存一个随机序(供划分valid用) +if not os.path.exists('../random_order.json'): + random_order = list(range(len(data))) + np.random.shuffle(random_order) + json.dump(random_order, open('../random_order.json', 'w'), indent=4) +else: + random_order = json.load(open('../random_order.json')) + +# 划分valid +train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0] +valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0] + +# 加载并精简词表,建立分词器 +token_dict, keep_tokens = load_vocab( + dict_path=dict_path, + simplified=True, + startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'], +) +tokenizer = Tokenizer(token_dict, do_lower_case=True) + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + """单条样本格式:[CLS]篇章[SEP]答案[SEP]问题[SEP] + """ + batch_token_ids, batch_segment_ids = [], [] + for is_end, (p, q, a) in self.sample(random): + p_token_ids, _ = tokenizer.encode(p, maxlen=max_p_len + 1) + a_token_ids, _ = tokenizer.encode(a, maxlen=max_a_len) + q_token_ids, _ = tokenizer.encode(q, 
maxlen=max_q_len) + token_ids = p_token_ids + a_token_ids[1:] + q_token_ids[1:] + segment_ids = [0] * len(p_token_ids) + segment_ids += [1] * (len(token_ids) - len(p_token_ids)) + batch_token_ids.append(token_ids) + batch_segment_ids.append(segment_ids) + if len(batch_token_ids) == self.batch_size or is_end: + batch_token_ids = sequence_padding(batch_token_ids) + batch_segment_ids = sequence_padding(batch_segment_ids) + yield [batch_token_ids, batch_segment_ids], None + batch_token_ids, batch_segment_ids = [], [] + + +class CrossEntropy(Loss): + """交叉熵作为loss,并mask掉输入部分 + """ + def compute_loss(self, inputs, mask=None): + y_true, y_mask, y_pred = inputs + y_true = y_true[:, 1:] # 目标token_ids + y_mask = y_mask[:, 1:] # segment_ids,刚好指示了要预测的部分 + y_pred = y_pred[:, :-1] # 预测序列,错开一位 + loss = K.sparse_categorical_crossentropy(y_true, y_pred) + loss = K.sum(loss * y_mask) / K.sum(y_mask) + return loss + + +model = build_transformer_model( + config_path, + checkpoint_path, + application='unilm', + keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表 +) + +output = CrossEntropy(2)(model.inputs + model.outputs) + +model = Model(model.inputs, output) +model.compile(optimizer=Adam(1e-5)) +model.summary() + + +class QuestionAnswerGeneration(AutoRegressiveDecoder): + """随机生成答案,并且通过beam search来生成问题 + """ + @AutoRegressiveDecoder.wraps(default_rtype='probas') + def predict(self, inputs, output_ids, states): + token_ids, segment_ids = inputs + token_ids = np.concatenate([token_ids, output_ids], 1) + segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1) + return self.last_token(model).predict([token_ids, segment_ids]) + + def generate(self, passage, topk=1, topp=0.95): + token_ids, segment_ids = tokenizer.encode(passage, maxlen=max_p_len) + a_ids = self.random_sample([token_ids, segment_ids], 1, + topp=topp)[0] # 基于随机采样 + token_ids += list(a_ids) + segment_ids += [1] * len(a_ids) + q_ids = self.beam_search([token_ids, segment_ids], + topk=topk) # 基于beam search + return (tokenizer.decode(q_ids), tokenizer.decode(a_ids)) + + +qag = QuestionAnswerGeneration( + start_id=None, end_id=tokenizer._token_end_id, maxlen=max_q_len +) + + +def predict_to_file(data, filename, topk=1): + """将预测结果输出到文件,方便评估 + """ + with open(filename, 'w', encoding='utf-8') as f: + for d in tqdm(iter(data), desc=u'正在预测(共%s条样本)' % len(data)): + q, a = qag.generate(d[0]) + s = '%s\t%s\t%s\n' % (q, a, d[0]) + f.write(s) + f.flush() + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.lowest = 1e10 + + def on_epoch_end(self, epoch, logs=None): + # 保存最优 + if logs['loss'] <= self.lowest: + self.lowest = logs['loss'] + model.save_weights('./best_model.weights') + + +if __name__ == '__main__': + + evaluator = Evaluator() + train_generator = data_generator(train_data, batch_size) + + model.fit( + train_generator.forfit(), + steps_per_epoch=1000, + epochs=epochs, + callbacks=[evaluator] + ) + +else: + + model.load_weights('./best_model.weights') + # predict_to_file(valid_data, 'qa.csv') diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_reading_comprehension_by_mlm.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_reading_comprehension_by_mlm.py new file mode 100644 index 0000000000000000000000000000000000000000..326fff4118be31bb288e55061ec1f0b37673c265 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_reading_comprehension_by_mlm.py @@ -0,0 +1,258 @@ +# +# Copyright 2017 The TensorFlow Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! -*- coding: utf-8 -*- +# 用MLM的方式做阅读理解任务 +# 数据集和评测同 https://github.com/bojone/dgcnn_for_reading_comprehension +# 10个epoch后在valid上能达到约0.77的分数 +# (Accuracy=0.7282149325820084 F1=0.8207266829447049 Final=0.7744708077633566) + +import json, os, re +import numpy as np +from bert4keras.backend import keras, K +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import Tokenizer, load_vocab +from bert4keras.optimizers import Adam +from bert4keras.snippets import sequence_padding, DataGenerator +from bert4keras.snippets import open +from keras.layers import Lambda +from keras.models import Model +from tqdm import tqdm + +max_p_len = 256 +max_q_len = 64 +max_a_len = 32 +batch_size = 32 +epochs = 10 + +# bert配置 +config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' + +# 标注数据 +webqa_data = json.load(open('/root/qa_datasets/WebQA.json')) +sogou_data = json.load(open('/root/qa_datasets/SogouQA.json')) + +# 保存一个随机序(供划分valid用) +if not os.path.exists('../random_order.json'): + random_order = list(range(len(sogou_data))) + np.random.shuffle(random_order) + json.dump(random_order, open('../random_order.json', 'w'), indent=4) +else: + random_order = json.load(open('../random_order.json')) + +# 划分valid +train_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 != 0] +valid_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 == 0] +train_data.extend(train_data) +train_data.extend(webqa_data) # 将SogouQA和WebQA按2:1的比例混合 + +# 加载并精简词表,建立分词器 +token_dict, keep_tokens = load_vocab( + dict_path=dict_path, + simplified=True, + startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'], +) +tokenizer = Tokenizer(token_dict, do_lower_case=True) + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + """单条样本格式为 + 输入:[CLS][MASK][MASK][SEP]问题[SEP]篇章[SEP] + 输出:答案 + """ + batch_token_ids, batch_segment_ids, batch_a_token_ids = [], [], [] + for is_end, D in self.sample(random): + question = D['question'] + answers = [p['answer'] for p in D['passages'] if p['answer']] + passage = np.random.choice(D['passages'])['passage'] + passage = 
re.sub(u' |、|;|,', ',', passage) + final_answer = '' + for answer in answers: + if all([ + a in passage[:max_p_len - 2] for a in answer.split(' ') + ]): + final_answer = answer.replace(' ', ',') + break + a_token_ids, _ = tokenizer.encode( + final_answer, maxlen=max_a_len + 1 + ) + q_token_ids, _ = tokenizer.encode(question, maxlen=max_q_len + 1) + p_token_ids, _ = tokenizer.encode(passage, maxlen=max_p_len + 1) + token_ids = [tokenizer._token_start_id] + token_ids += ([tokenizer._token_mask_id] * max_a_len) + token_ids += [tokenizer._token_end_id] + token_ids += (q_token_ids[1:] + p_token_ids[1:]) + segment_ids = [0] * len(token_ids) + batch_token_ids.append(token_ids) + batch_segment_ids.append(segment_ids) + batch_a_token_ids.append(a_token_ids[1:]) + if len(batch_token_ids) == self.batch_size or is_end: + batch_token_ids = sequence_padding(batch_token_ids) + batch_segment_ids = sequence_padding(batch_segment_ids) + batch_a_token_ids = sequence_padding( + batch_a_token_ids, max_a_len + ) + yield [batch_token_ids, batch_segment_ids], batch_a_token_ids + batch_token_ids, batch_segment_ids, batch_a_token_ids = [], [], [] + + +model = build_transformer_model( + config_path, + checkpoint_path, + with_mlm=True, + keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表 +) +output = Lambda(lambda x: x[:, 1:max_a_len + 1])(model.output) +model = Model(model.input, output) +model.summary() + + +def masked_cross_entropy(y_true, y_pred): + """交叉熵作为loss,并mask掉padding部分的预测 + """ + y_true = K.reshape(y_true, [K.shape(y_true)[0], -1]) + y_mask = K.cast(K.not_equal(y_true, 0), K.floatx()) + cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred) + cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask) + return cross_entropy + + +model.compile(loss=masked_cross_entropy, optimizer=Adam(1e-5)) + + +def get_ngram_set(x, n): + """生成ngram合集,返回结果格式是: + {(n-1)-gram: set([n-gram的第n个字集合])} + """ + result = {} + for i in range(len(x) - n + 1): + k = tuple(x[i:i + n]) + if k[:-1] not in result: + result[k[:-1]] = set() + result[k[:-1]].add(k[-1]) + return result + + +def gen_answer(question, passages): + """由于是MLM模型,所以可以直接argmax解码。 + """ + all_p_token_ids, token_ids, segment_ids = [], [], [] + for passage in passages: + passage = re.sub(u' |、|;|,', ',', passage) + p_token_ids, _ = tokenizer.encode(passage, maxlen=max_p_len + 1) + q_token_ids, _ = tokenizer.encode(question, maxlen=max_q_len + 1) + all_p_token_ids.append(p_token_ids[1:]) + token_ids.append([tokenizer._token_start_id]) + token_ids[-1] += ([tokenizer._token_mask_id] * max_a_len) + token_ids[-1] += [tokenizer._token_end_id] + token_ids[-1] += (q_token_ids[1:] + p_token_ids[1:]) + segment_ids.append([0] * len(token_ids[-1])) + token_ids = sequence_padding(token_ids) + segment_ids = sequence_padding(segment_ids) + probas = model.predict([token_ids, segment_ids]) + results = {} + for t, p in zip(all_p_token_ids, probas): + a, score = tuple(), 0. 
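+        # Greedy decoding position by position: at step i only tokens that extend
+        # an (i+1)-gram of the passage (plus [SEP] as a stop token) are allowed.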
+ for i in range(max_a_len): + idxs = list(get_ngram_set(t, i + 1)[a]) + if tokenizer._token_end_id not in idxs: + idxs.append(tokenizer._token_end_id) + # pi是将passage以外的token的概率置零 + pi = np.zeros_like(p[i]) + pi[idxs] = p[i, idxs] + a = a + (pi.argmax(),) + score += pi.max() + if a[-1] == tokenizer._token_end_id: + break + score = score / (i + 1) + a = tokenizer.decode(a) + if a: + results[a] = results.get(a, []) + [score] + results = { + k: (np.array(v)**2).sum() / (sum(v) + 1) + for k, v in results.items() + } + return results + + +def max_in_dict(d): + if d: + return sorted(d.items(), key=lambda s: -s[1])[0][0] + + +def predict_to_file(data, filename): + """将预测结果输出到文件,方便评估 + """ + with open(filename, 'w', encoding='utf-8') as f: + for d in tqdm(iter(data), desc=u'正在预测(共%s条样本)' % len(data)): + q_text = d['question'] + p_texts = [p['passage'] for p in d['passages']] + a = gen_answer(q_text, p_texts) + a = max_in_dict(a) + if a: + s = u'%s\t%s\n' % (d['id'], a) + else: + s = u'%s\t\n' % (d['id']) + f.write(s) + f.flush() + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.lowest = 1e10 + + def on_epoch_end(self, epoch, logs=None): + # 保存最优 + if logs['loss'] <= self.lowest: + self.lowest = logs['loss'] + model.save_weights('./best_model.weights') + + +if __name__ == '__main__': + + evaluator = Evaluator() + train_generator = data_generator(train_data, batch_size) + + model.fit( + train_generator.forfit(), + steps_per_epoch=len(train_generator), + epochs=epochs, + callbacks=[evaluator] + ) + +else: + + model.load_weights('./best_model.weights') diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_reading_comprehension_by_seq2seq.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_reading_comprehension_by_seq2seq.py new file mode 100644 index 0000000000000000000000000000000000000000..69a12630ce2135f3be10c36818777703315a0303 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_reading_comprehension_by_seq2seq.py @@ -0,0 +1,288 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding: utf-8 -*- +# 用seq2seq的方式做阅读理解任务 +# 数据集和评测同 https://github.com/bojone/dgcnn_for_reading_comprehension +# 8个epoch后在valid上能达到约0.77的分数 +# (Accuracy=0.7259005836184343 F1=0.813860036706151 Final=0.7698803101622926) + +import json, os, re +import numpy as np +from bert4keras.backend import keras, K +from bert4keras.layers import Loss +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import Tokenizer, load_vocab +from bert4keras.optimizers import Adam +from bert4keras.snippets import sequence_padding, open +from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder +from keras.models import Model +from tqdm import tqdm + +max_p_len = 256 +max_q_len = 64 +max_a_len = 32 +max_qa_len = max_q_len + max_a_len +batch_size = 32 +epochs = 8 + +# bert配置 +config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' + +# 标注数据 +webqa_data = json.load(open('/root/qa_datasets/WebQA.json')) +sogou_data = json.load(open('/root/qa_datasets/SogouQA.json')) + +# 保存一个随机序(供划分valid用) +if not os.path.exists('../random_order.json'): + random_order = list(range(len(sogou_data))) + np.random.shuffle(random_order) + json.dump(random_order, open('../random_order.json', 'w'), indent=4) +else: + random_order = json.load(open('../random_order.json')) + +# 划分valid +train_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 != 0] +valid_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 == 0] +train_data.extend(train_data) +train_data.extend(webqa_data) # 将SogouQA和WebQA按2:1的比例混合 + +# 加载并精简词表,建立分词器 +token_dict, keep_tokens = load_vocab( + dict_path=dict_path, + simplified=True, + startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'], +) +tokenizer = Tokenizer(token_dict, do_lower_case=True) + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + """单条样本格式:[CLS]篇章[SEP]问题[SEP]答案[SEP] + """ + batch_token_ids, batch_segment_ids = [], [] + for is_end, D in self.sample(random): + question = D['question'] + answers = [p['answer'] for p in D['passages'] if p['answer']] + passage = np.random.choice(D['passages'])['passage'] + passage = re.sub(u' |、|;|,', ',', passage) + final_answer = '' + for answer in answers: + if all([ + a in passage[:max_p_len - 2] for a in answer.split(' ') + ]): + final_answer = answer.replace(' ', ',') + break + qa_token_ids, qa_segment_ids = tokenizer.encode( + question, final_answer, maxlen=max_qa_len + 1 + ) + p_token_ids, p_segment_ids = tokenizer.encode( + passage, maxlen=max_p_len + ) + token_ids = p_token_ids + qa_token_ids[1:] + segment_ids = p_segment_ids + qa_segment_ids[1:] + batch_token_ids.append(token_ids) + batch_segment_ids.append(segment_ids) + if len(batch_token_ids) == self.batch_size or is_end: + batch_token_ids = sequence_padding(batch_token_ids) + batch_segment_ids = sequence_padding(batch_segment_ids) + yield [batch_token_ids, batch_segment_ids], None + batch_token_ids, batch_segment_ids = [], [] + + +class CrossEntropy(Loss): + """交叉熵作为loss,并mask掉输入部分 + """ + def compute_loss(self, inputs, mask=None): + y_true, y_mask, y_pred = inputs + y_true = y_true[:, 1:] # 目标token_ids + y_mask = y_mask[:, 1:] # segment_ids,刚好指示了要预测的部分 + y_pred = y_pred[:, :-1] # 预测序列,错开一位 + loss = K.sparse_categorical_crossentropy(y_true, y_pred) + loss = K.sum(loss * y_mask) / K.sum(y_mask) + return loss + + +model = build_transformer_model( + 
config_path, + checkpoint_path, + application='unilm', + keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表 +) + +output = CrossEntropy(2)(model.inputs + model.outputs) + +model = Model(model.inputs, output) +model.compile(optimizer=Adam(1e-5)) +model.summary() + + +class ReadingComprehension(AutoRegressiveDecoder): + """beam search解码来生成答案 + passages为多篇章组成的list,从多篇文章中自动决策出最优的答案, + 如果没答案,则返回空字符串。 + mode是extractive时,按照抽取式执行,即答案必须是原篇章的一个片段。 + """ + def __init__(self, mode='extractive', **kwargs): + super(ReadingComprehension, self).__init__(**kwargs) + self.mode = mode + + def get_ngram_set(self, x, n): + """生成ngram合集,返回结果格式是: + {(n-1)-gram: set([n-gram的第n个字集合])} + """ + result = {} + for i in range(len(x) - n + 1): + k = tuple(x[i:i + n]) + if k[:-1] not in result: + result[k[:-1]] = set() + result[k[:-1]].add(k[-1]) + return result + + @AutoRegressiveDecoder.wraps(default_rtype='probas', use_states=True) + def predict(self, inputs, output_ids, states): + inputs = [i for i in inputs if i[0, 0] > -1] # 过滤掉无答案篇章 + topk = len(inputs[0]) + all_token_ids, all_segment_ids = [], [] + for token_ids in inputs: # inputs里每个元素都代表一个篇章 + token_ids = np.concatenate([token_ids, output_ids], 1) + segment_ids = np.zeros_like(token_ids) + if states > 0: + segment_ids[:, -output_ids.shape[1]:] = 1 + all_token_ids.extend(token_ids) + all_segment_ids.extend(segment_ids) + padded_all_token_ids = sequence_padding(all_token_ids) + padded_all_segment_ids = sequence_padding(all_segment_ids) + probas = model.predict([padded_all_token_ids, padded_all_segment_ids]) + probas = [ + probas[i, len(ids) - 1] for i, ids in enumerate(all_token_ids) + ] + probas = np.array(probas).reshape((len(inputs), topk, -1)) + if states == 0: + # 这一步主要是排除没有答案的篇章 + # 如果一开始最大值就为end_id,那说明该篇章没有答案 + argmax = probas[:, 0].argmax(axis=1) + available_idxs = np.where(argmax != self.end_id)[0] + if len(available_idxs) == 0: + scores = np.zeros_like(probas[0]) + scores[:, self.end_id] = 1 + return scores, states + 1 + else: + for i in np.where(argmax == self.end_id)[0]: + inputs[i][:, 0] = -1 # 无答案篇章首位标记为-1 + probas = probas[available_idxs] + inputs = [i for i in inputs if i[0, 0] > -1] # 过滤掉无答案篇章 + if self.mode == 'extractive': + # 如果是抽取式,那么答案必须是篇章的一个片段 + # 那么将非篇章片段的概率值全部置0 + new_probas = np.zeros_like(probas) + ngrams = {} + for token_ids in inputs: + token_ids = token_ids[0] + sep_idx = np.where(token_ids == tokenizer._token_end_id)[0][0] + p_token_ids = token_ids[1:sep_idx] + for k, v in self.get_ngram_set(p_token_ids, states + 1).items(): + ngrams[k] = ngrams.get(k, set()) | v + for i, ids in enumerate(output_ids): + available_idxs = ngrams.get(tuple(ids), set()) + available_idxs.add(tokenizer._token_end_id) + available_idxs = list(available_idxs) + new_probas[:, i, available_idxs] = probas[:, i, available_idxs] + probas = new_probas + return (probas**2).sum(0) / (probas.sum(0) + 1), states + 1 # 某种平均投票方式 + + def answer(self, question, passages, topk=1): + token_ids = [] + for passage in passages: + passage = re.sub(u' |、|;|,', ',', passage) + p_token_ids = tokenizer.encode(passage, maxlen=max_p_len)[0] + q_token_ids = tokenizer.encode(question, maxlen=max_q_len + 1)[0] + token_ids.append(p_token_ids + q_token_ids[1:]) + output_ids = self.beam_search( + token_ids, topk=topk, states=0 + ) # 基于beam search + return tokenizer.decode(output_ids) + + +reader = ReadingComprehension( + start_id=None, + end_id=tokenizer._token_end_id, + maxlen=max_a_len, + mode='extractive' +) + + +def predict_to_file(data, filename, topk=1): + """将预测结果输出到文件,方便评估 + """ + 
with open(filename, 'w', encoding='utf-8') as f: + for d in tqdm(iter(data), desc=u'正在预测(共%s条样本)' % len(data)): + q_text = d['question'] + p_texts = [p['passage'] for p in d['passages']] + a = reader.answer(q_text, p_texts, topk) + if a: + s = u'%s\t%s\n' % (d['id'], a) + else: + s = u'%s\t\n' % (d['id']) + f.write(s) + f.flush() + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.lowest = 1e10 + + def on_epoch_end(self, epoch, logs=None): + # 保存最优 + if logs['loss'] <= self.lowest: + self.lowest = logs['loss'] + model.save_weights('./best_model.weights') + + +if __name__ == '__main__': + + evaluator = Evaluator() + train_generator = data_generator(train_data, batch_size) + + model.fit( + train_generator.forfit(), + steps_per_epoch=len(train_generator), + epochs=epochs, + callbacks=[evaluator] + ) + +else: + + model.load_weights('./best_model.weights') diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_relation_extraction.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_relation_extraction.py new file mode 100644 index 0000000000000000000000000000000000000000..936419b7021a438937e97714fe64bd34384b9f42 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_relation_extraction.py @@ -0,0 +1,376 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding:utf-8 -*- +# 三元组抽取任务,基于“半指针-半标注”结构 +# 文章介绍:https://kexue.fm/archives/7161 +# 数据集:http://ai.baidu.com/broad/download?dataset=sked +# 最优f1=0.82198 +# 换用RoBERTa Large可以达到f1=0.829+ +# 说明:由于使用了EMA,需要跑足够多的步数(5000步以上)才生效,如果 +# 你的数据总量比较少,那么请务必跑足够多的epoch数,或者去掉EMA。 + +import json +import numpy as np +from bert4keras.backend import keras, K, batch_gather +from bert4keras.layers import Loss +from bert4keras.layers import LayerNormalization +from bert4keras.tokenizers import Tokenizer +from bert4keras.models import build_transformer_model +from bert4keras.optimizers import Adam, extend_with_exponential_moving_average +from bert4keras.snippets import sequence_padding, DataGenerator +from bert4keras.snippets import open, to_array +from keras.layers import Input, Dense, Lambda, Reshape +from keras.models import Model +from tqdm import tqdm + +maxlen = 128 +batch_size = 64 +config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' + + +def load_data(filename): + """加载数据 + 单条格式:{'text': text, 'spo_list': [(s, p, o)]} + """ + D = [] + with open(filename, encoding='utf-8') as f: + for l in f: + l = json.loads(l) + D.append({ + 'text': l['text'], + 'spo_list': [(spo['subject'], spo['predicate'], spo['object']) + for spo in l['spo_list']] + }) + return D + + +# 加载数据集 +train_data = load_data('/root/kg/datasets/train_data.json') +valid_data = load_data('/root/kg/datasets/dev_data.json') +predicate2id, id2predicate = {}, {} + +with open('/root/kg/datasets/all_50_schemas') as f: + for l in f: + l = json.loads(l) + if l['predicate'] not in predicate2id: + id2predicate[len(predicate2id)] = l['predicate'] + predicate2id[l['predicate']] = len(predicate2id) + +# 建立分词器 +tokenizer = Tokenizer(dict_path, do_lower_case=True) + + +def search(pattern, sequence): + """从sequence中寻找子串pattern + 如果找到,返回第一个下标;否则返回-1。 + """ + n = len(pattern) + for i in range(len(sequence)): + if sequence[i:i + n] == pattern: + return i + return -1 + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + batch_token_ids, batch_segment_ids = [], [] + batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], [] + for is_end, d in self.sample(random): + token_ids, segment_ids = tokenizer.encode(d['text'], maxlen=maxlen) + # 整理三元组 {s: [(o, p)]} + spoes = {} + for s, p, o in d['spo_list']: + s = tokenizer.encode(s)[0][1:-1] + p = predicate2id[p] + o = tokenizer.encode(o)[0][1:-1] + s_idx = search(s, token_ids) + o_idx = search(o, token_ids) + if s_idx != -1 and o_idx != -1: + s = (s_idx, s_idx + len(s) - 1) + o = (o_idx, o_idx + len(o) - 1, p) + if s not in spoes: + spoes[s] = [] + spoes[s].append(o) + if spoes: + # subject标签 + subject_labels = np.zeros((len(token_ids), 2)) + for s in spoes: + subject_labels[s[0], 0] = 1 + subject_labels[s[1], 1] = 1 + # 随机选一个subject(这里没有实现错误!这就是想要的效果!!) 
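+                # Descriptive note: each training sample keeps the full subject
+                # labels but builds object labels for just one randomly sampled
+                # gold subject, so the object branch learns to predict objects
+                # and predicates conditioned on a given subject; different
+                # epochs sample different subjects, hence the remark above that
+                # this is intentional rather than a bug.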
+ start, end = np.array(list(spoes.keys())).T + start = np.random.choice(start) + end = np.random.choice(end[end >= start]) + subject_ids = (start, end) + # 对应的object标签 + object_labels = np.zeros((len(token_ids), len(predicate2id), 2)) + for o in spoes.get(subject_ids, []): + object_labels[o[0], o[2], 0] = 1 + object_labels[o[1], o[2], 1] = 1 + # 构建batch + batch_token_ids.append(token_ids) + batch_segment_ids.append(segment_ids) + batch_subject_labels.append(subject_labels) + batch_subject_ids.append(subject_ids) + batch_object_labels.append(object_labels) + if len(batch_token_ids) == self.batch_size or is_end: + batch_token_ids = sequence_padding(batch_token_ids) + batch_segment_ids = sequence_padding(batch_segment_ids) + batch_subject_labels = sequence_padding( + batch_subject_labels + ) + batch_subject_ids = np.array(batch_subject_ids) + batch_object_labels = sequence_padding(batch_object_labels) + yield [ + batch_token_ids, batch_segment_ids, + batch_subject_labels, batch_subject_ids, + batch_object_labels + ], None + batch_token_ids, batch_segment_ids = [], [] + batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], [] + + +def extract_subject(inputs): + """根据subject_ids从output中取出subject的向量表征 + """ + output, subject_ids = inputs + start = batch_gather(output, subject_ids[:, :1]) + end = batch_gather(output, subject_ids[:, 1:]) + subject = K.concatenate([start, end], 2) + return subject[:, 0] + + +# 补充输入 +subject_labels = Input(shape=(None, 2), name='Subject-Labels') +subject_ids = Input(shape=(2,), name='Subject-Ids') +object_labels = Input(shape=(None, len(predicate2id), 2), name='Object-Labels') + +# 加载预训练模型 +bert = build_transformer_model( + config_path=config_path, + checkpoint_path=checkpoint_path, + return_keras_model=False, +) + +# 预测subject +output = Dense( + units=2, activation='sigmoid', kernel_initializer=bert.initializer +)(bert.model.output) +subject_preds = Lambda(lambda x: x**2)(output) + +subject_model = Model(bert.model.inputs, subject_preds) + +# 传入subject,预测object +# 通过Conditional Layer Normalization将subject融入到object的预测中 +output = bert.model.layers[-2].get_output_at(-1) # 自己想为什么是-2而不是-1 +subject = Lambda(extract_subject)([output, subject_ids]) +output = LayerNormalization(conditional=True)([output, subject]) +output = Dense( + units=len(predicate2id) * 2, + activation='sigmoid', + kernel_initializer=bert.initializer +)(output) +output = Lambda(lambda x: x**4)(output) +object_preds = Reshape((-1, len(predicate2id), 2))(output) + +object_model = Model(bert.model.inputs + [subject_ids], object_preds) + + +class TotalLoss(Loss): + """subject_loss与object_loss之和,都是二分类交叉熵 + """ + def compute_loss(self, inputs, mask=None): + subject_labels, object_labels = inputs[:2] + subject_preds, object_preds, _ = inputs[2:] + if mask[4] is None: + mask = 1.0 + else: + mask = K.cast(mask[4], K.floatx()) + # subject部分loss + subject_loss = K.binary_crossentropy(subject_labels, subject_preds) + subject_loss = K.mean(subject_loss, 2) + subject_loss = K.sum(subject_loss * mask) / K.sum(mask) + # object部分loss + object_loss = K.binary_crossentropy(object_labels, object_preds) + object_loss = K.sum(K.mean(object_loss, 3), 2) + object_loss = K.sum(object_loss * mask) / K.sum(mask) + # 总的loss + return subject_loss + object_loss + + +subject_preds, object_preds = TotalLoss([2, 3])([ + subject_labels, object_labels, subject_preds, object_preds, + bert.model.output +]) + +# 训练模型 +train_model = Model( + bert.model.inputs + [subject_labels, subject_ids, object_labels], + 
[subject_preds, object_preds] +) + +AdamEMA = extend_with_exponential_moving_average(Adam, name='AdamEMA') +optimizer = AdamEMA(learning_rate=1e-5) +train_model.compile(optimizer=optimizer) + + +def extract_spoes(text): + """抽取输入text所包含的三元组 + """ + tokens = tokenizer.tokenize(text, maxlen=maxlen) + mapping = tokenizer.rematch(text, tokens) + token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) + token_ids, segment_ids = to_array([token_ids], [segment_ids]) + # 抽取subject + subject_preds = subject_model.predict([token_ids, segment_ids]) + subject_preds[:, [0, -1]] *= 0 + start = np.where(subject_preds[0, :, 0] > 0.6)[0] + end = np.where(subject_preds[0, :, 1] > 0.5)[0] + subjects = [] + for i in start: + j = end[end >= i] + if len(j) > 0: + j = j[0] + subjects.append((i, j)) + if subjects: + spoes = [] + token_ids = np.repeat(token_ids, len(subjects), 0) + segment_ids = np.repeat(segment_ids, len(subjects), 0) + subjects = np.array(subjects) + # 传入subject,抽取object和predicate + object_preds = object_model.predict([token_ids, segment_ids, subjects]) + object_preds[:, [0, -1]] *= 0 + for subject, object_pred in zip(subjects, object_preds): + start = np.where(object_pred[:, :, 0] > 0.6) + end = np.where(object_pred[:, :, 1] > 0.5) + for _start, predicate1 in zip(*start): + for _end, predicate2 in zip(*end): + if _start <= _end and predicate1 == predicate2: + spoes.append( + ((mapping[subject[0]][0], + mapping[subject[1]][-1]), predicate1, + (mapping[_start][0], mapping[_end][-1])) + ) + break + return [(text[s[0]:s[1] + 1], id2predicate[p], text[o[0]:o[1] + 1]) + for s, p, o, in spoes] + else: + return [] + + +class SPO(tuple): + """用来存三元组的类 + 表现跟tuple基本一致,只是重写了 __hash__ 和 __eq__ 方法, + 使得在判断两个三元组是否等价时容错性更好。 + """ + def __init__(self, spo): + self.spox = ( + tuple(tokenizer.tokenize(spo[0])), + spo[1], + tuple(tokenizer.tokenize(spo[2])), + ) + + def __hash__(self): + return self.spox.__hash__() + + def __eq__(self, spo): + return self.spox == spo.spox + + +def evaluate(data): + """评估函数,计算f1、precision、recall + """ + X, Y, Z = 1e-10, 1e-10, 1e-10 + f = open('dev_pred.json', 'w', encoding='utf-8') + pbar = tqdm() + for d in data: + R = set([SPO(spo) for spo in extract_spoes(d['text'])]) + T = set([SPO(spo) for spo in d['spo_list']]) + X += len(R & T) + Y += len(R) + Z += len(T) + f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z + pbar.update() + pbar.set_description( + 'f1: %.5f, precision: %.5f, recall: %.5f' % (f1, precision, recall) + ) + s = json.dumps({ + 'text': d['text'], + 'spo_list': list(T), + 'spo_list_pred': list(R), + 'new': list(R - T), + 'lack': list(T - R), + }, + ensure_ascii=False, + indent=4) + f.write(s + '\n') + pbar.close() + f.close() + return f1, precision, recall + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.best_val_f1 = 0. 
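+        # Note: evaluation in on_epoch_end runs with the EMA-averaged weights
+        # applied (optimizer.apply_ema_weights) and the raw training weights
+        # are restored afterwards via optimizer.reset_old_weights.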
+ + def on_epoch_end(self, epoch, logs=None): + optimizer.apply_ema_weights() + f1, precision, recall = evaluate(valid_data) + if f1 >= self.best_val_f1: + self.best_val_f1 = f1 + train_model.save_weights('best_model.weights') + optimizer.reset_old_weights() + print( + 'f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' % + (f1, precision, recall, self.best_val_f1) + ) + + +if __name__ == '__main__': + + train_generator = data_generator(train_data, batch_size) + evaluator = Evaluator() + + train_model.fit( + train_generator.forfit(), + steps_per_epoch=len(train_generator), + epochs=20, + callbacks=[evaluator] + ) + +else: + + train_model.load_weights('best_model.weights') diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_relation_extraction_gplinker.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_relation_extraction_gplinker.py new file mode 100644 index 0000000000000000000000000000000000000000..abd4091fd728011a5d8433e596b6629f853495da --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_relation_extraction_gplinker.py @@ -0,0 +1,310 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding:utf-8 -*- +# 三元组抽取任务,基于GlobalPointer的仿TPLinker设计 +# 文章介绍:https://kexue.fm/archives/8888 +# 数据集:http://ai.baidu.com/broad/download?dataset=sked +# 最优f1=0.827 +# 说明:由于使用了EMA,需要跑足够多的步数(5000步以上)才生效,如果 +# 你的数据总量比较少,那么请务必跑足够多的epoch数,或者去掉EMA。 + +import json +import numpy as np +from bert4keras.backend import keras, K +from bert4keras.backend import sparse_multilabel_categorical_crossentropy +from bert4keras.tokenizers import Tokenizer +from bert4keras.layers import EfficientGlobalPointer as GlobalPointer +from bert4keras.models import build_transformer_model +from bert4keras.optimizers import Adam, extend_with_exponential_moving_average +from bert4keras.snippets import sequence_padding, DataGenerator +from bert4keras.snippets import open, to_array +from tqdm import tqdm + +maxlen = 128 +batch_size = 64 +config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' + + +def load_data(filename): + """加载数据 + 单条格式:{'text': text, 'spo_list': [(s, p, o)]} + """ + D = [] + with open(filename, encoding='utf-8') as f: + for l in f: + l = json.loads(l) + D.append({ + 'text': l['text'], + 'spo_list': [(spo['subject'], spo['predicate'], spo['object']) + for spo in l['spo_list']] + }) + return D + + +# 加载数据集 +train_data = load_data('/root/kg/datasets/train_data.json') +valid_data = load_data('/root/kg/datasets/dev_data.json') +predicate2id, id2predicate = {}, {} + +with open('/root/kg/datasets/all_50_schemas') as f: + for l in f: + l = json.loads(l) + if l['predicate'] not in predicate2id: + id2predicate[len(predicate2id)] = l['predicate'] + predicate2id[l['predicate']] = len(predicate2id) + +# 建立分词器 +tokenizer = Tokenizer(dict_path, do_lower_case=True) + + +def search(pattern, sequence): + """从sequence中寻找子串pattern + 如果找到,返回第一个下标;否则返回-1。 + """ + n = len(pattern) + for i in range(len(sequence)): + if sequence[i:i + n] == pattern: + return i + return -1 + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + batch_token_ids, batch_segment_ids = [], [] + batch_entity_labels, batch_head_labels, batch_tail_labels = [], [], [] + for is_end, d in self.sample(random): + token_ids, segment_ids = tokenizer.encode(d['text'], maxlen=maxlen) + # 整理三元组 {(s, o, p)} + spoes = set() + for s, p, o in d['spo_list']: + s = tokenizer.encode(s)[0][1:-1] + p = predicate2id[p] + o = tokenizer.encode(o)[0][1:-1] + sh = search(s, token_ids) + oh = search(o, token_ids) + if sh != -1 and oh != -1: + spoes.add((sh, sh + len(s) - 1, p, oh, oh + len(o) - 1)) + # 构建标签 + entity_labels = [set() for _ in range(2)] + head_labels = [set() for _ in range(len(predicate2id))] + tail_labels = [set() for _ in range(len(predicate2id))] + for sh, st, p, oh, ot in spoes: + entity_labels[0].add((sh, st)) + entity_labels[1].add((oh, ot)) + head_labels[p].add((sh, oh)) + tail_labels[p].add((st, ot)) + for label in entity_labels + head_labels + tail_labels: + if not label: # 至少要有一个标签 + label.add((0, 0)) # 如果没有则用0填充 + entity_labels = sequence_padding([list(l) for l in entity_labels]) + head_labels = sequence_padding([list(l) for l in head_labels]) + tail_labels = sequence_padding([list(l) for l in tail_labels]) + # 构建batch + batch_token_ids.append(token_ids) + batch_segment_ids.append(segment_ids) + batch_entity_labels.append(entity_labels) + batch_head_labels.append(head_labels) + batch_tail_labels.append(tail_labels) + if len(batch_token_ids) == self.batch_size 
or is_end: + batch_token_ids = sequence_padding(batch_token_ids) + batch_segment_ids = sequence_padding(batch_segment_ids) + batch_entity_labels = sequence_padding( + batch_entity_labels, seq_dims=2 + ) + batch_head_labels = sequence_padding( + batch_head_labels, seq_dims=2 + ) + batch_tail_labels = sequence_padding( + batch_tail_labels, seq_dims=2 + ) + yield [batch_token_ids, batch_segment_ids], [ + batch_entity_labels, batch_head_labels, batch_tail_labels + ] + batch_token_ids, batch_segment_ids = [], [] + batch_entity_labels, batch_head_labels, batch_tail_labels = [], [], [] + + +def globalpointer_crossentropy(y_true, y_pred): + """给GlobalPointer设计的交叉熵 + """ + shape = K.shape(y_pred) + y_true = y_true[..., 0] * K.cast(shape[2], K.floatx()) + y_true[..., 1] + y_pred = K.reshape(y_pred, (shape[0], -1, K.prod(shape[2:]))) + loss = sparse_multilabel_categorical_crossentropy(y_true, y_pred, True) + return K.mean(K.sum(loss, axis=1)) + + +# 加载预训练模型 +base = build_transformer_model( + config_path=config_path, + checkpoint_path=checkpoint_path, + return_keras_model=False +) + +# 预测结果 +entity_output = GlobalPointer(heads=2, head_size=64)(base.model.output) +head_output = GlobalPointer( + heads=len(predicate2id), head_size=64, RoPE=False, tril_mask=False +)(base.model.output) +tail_output = GlobalPointer( + heads=len(predicate2id), head_size=64, RoPE=False, tril_mask=False +)(base.model.output) +outputs = [entity_output, head_output, tail_output] + +# 构建模型 +AdamEMA = extend_with_exponential_moving_average(Adam, name='AdamEMA') +optimizer = AdamEMA(learning_rate=1e-5) +model = keras.models.Model(base.model.inputs, outputs) +model.compile(loss=globalpointer_crossentropy, optimizer=optimizer) +model.summary() + + +def extract_spoes(text, threshold=0): + """抽取输入text所包含的三元组 + """ + tokens = tokenizer.tokenize(text, maxlen=maxlen) + mapping = tokenizer.rematch(text, tokens) + token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) + token_ids, segment_ids = to_array([token_ids], [segment_ids]) + outputs = model.predict([token_ids, segment_ids]) + outputs = [o[0] for o in outputs] + # 抽取subject和object + subjects, objects = set(), set() + outputs[0][:, [0, -1]] -= np.inf + outputs[0][:, :, [0, -1]] -= np.inf + for l, h, t in zip(*np.where(outputs[0] > threshold)): + if l == 0: + subjects.add((h, t)) + else: + objects.add((h, t)) + # 识别对应的predicate + spoes = set() + for sh, st in subjects: + for oh, ot in objects: + p1s = np.where(outputs[1][:, sh, oh] > threshold)[0] + p2s = np.where(outputs[2][:, st, ot] > threshold)[0] + ps = set(p1s) & set(p2s) + for p in ps: + spoes.add(( + text[mapping[sh][0]:mapping[st][-1] + 1], id2predicate[p], + text[mapping[oh][0]:mapping[ot][-1] + 1] + )) + return list(spoes) + + +class SPO(tuple): + """用来存三元组的类 + 表现跟tuple基本一致,只是重写了 __hash__ 和 __eq__ 方法, + 使得在判断两个三元组是否等价时容错性更好。 + """ + def __init__(self, spo): + self.spox = ( + tuple(tokenizer.tokenize(spo[0])), + spo[1], + tuple(tokenizer.tokenize(spo[2])), + ) + + def __hash__(self): + return self.spox.__hash__() + + def __eq__(self, spo): + return self.spox == spo.spox + + +def evaluate(data): + """评估函数,计算f1、precision、recall + """ + X, Y, Z = 1e-10, 1e-10, 1e-10 + f = open('dev_pred.json', 'w', encoding='utf-8') + pbar = tqdm() + for d in data: + R = set([SPO(spo) for spo in extract_spoes(d['text'])]) + T = set([SPO(spo) for spo in d['spo_list']]) + X += len(R & T) + Y += len(R) + Z += len(T) + f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z + pbar.update() + pbar.set_description( + 'f1: %.5f, precision: %.5f, 
recall: %.5f' % (f1, precision, recall) + ) + s = json.dumps({ + 'text': d['text'], + 'spo_list': list(T), + 'spo_list_pred': list(R), + 'new': list(R - T), + 'lack': list(T - R), + }, + ensure_ascii=False, + indent=4) + f.write(s + '\n') + pbar.close() + f.close() + return f1, precision, recall + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.best_val_f1 = 0. + + def on_epoch_end(self, epoch, logs=None): + optimizer.apply_ema_weights() + f1, precision, recall = evaluate(valid_data) + if f1 >= self.best_val_f1: + self.best_val_f1 = f1 + model.save_weights('best_model.weights') + optimizer.reset_old_weights() + print( + 'f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' % + (f1, precision, recall, self.best_val_f1) + ) + + +if __name__ == '__main__': + + train_generator = data_generator(train_data, batch_size) + evaluator = Evaluator() + + model.fit( + train_generator.forfit(), + steps_per_epoch=len(train_generator), + epochs=20, + callbacks=[evaluator] + ) + +else: + + model.load_weights('best_model.weights') diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_sentence_similarity_lcqmc.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_sentence_similarity_lcqmc.py new file mode 100644 index 0000000000000000000000000000000000000000..50653ec79f04035fe372634ae26342dddebdda6f --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_sentence_similarity_lcqmc.py @@ -0,0 +1,209 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding:utf-8 -*- +# 句子对分类任务,LCQMC数据集 +# val_acc: 0.887071, test_acc: 0.870320 + +from npu_bridge.npu_init import * +import numpy as np +from examples.bertkeras import keras, set_gelu, K +from examples.bertkeras.tokenizers import Tokenizer +from examples.bertkeras import build_transformer_model +from examples.bertkeras import Adam +from examples.bertkeras import sequence_padding, DataGenerator +from examples.bertkeras import open +from examples.bertkeras import Dropout, Dense +#from npu_bridge.estimator.npu import npu_convert_dropout + +set_gelu('tanh') # 切换gelu版本 + + +maxlen = 128 +batch_size = 64 +config_path = '/chinese_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/chinese_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/chinese_L-12_H-768_A-12/vocab.txt' + + +def load_data(filename): + """加载数据 + 单条格式:(文本1, 文本2, 标签id) + """ + D = [] + with open(filename, encoding='utf-8') as f: + for l in f: + text1, text2, label = l.strip().split('\t') + D.append((text1, text2, int(label))) + return D + + +# 加载数据集 +train_data = load_data('datasets/lcqmc/lcqmc.train.data') +valid_data = load_data('datasets/lcqmc/lcqmc.valid.data') +test_data = load_data('datasets/lcqmc/lcqmc.test.data') + +# 建立分词器 +tokenizer = Tokenizer(dict_path, do_lower_case=True) + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + batch_token_ids, batch_segment_ids, batch_labels = [], [], [] + for is_end, (text1, text2, label) in self.sample(random): + token_ids, segment_ids = tokenizer.encode( + text1, text2, 128) + + batch_token_ids.append(token_ids) + batch_segment_ids.append(segment_ids) + batch_labels.append([label]) + if len(batch_token_ids) == self.batch_size or is_end: + batch_token_ids = sequence_padding(batch_token_ids) + batch_segment_ids = sequence_padding(batch_segment_ids) + batch_labels = sequence_padding(batch_labels) + yield [batch_token_ids, batch_segment_ids], batch_labels + batch_token_ids, batch_segment_ids, batch_labels = [], [], [] + + +# 加载预训练模型 +bert = build_transformer_model( + config_path=config_path, + checkpoint_path=checkpoint_path, + with_pool=True, + return_keras_model=False, +) +def output(x): + return npu_ops.dropout(x,keep_prob=0.9) + +#output = Dropout(rate=0.1)(bert.model.output) + +#output = npu_ops.dropout(bert.model.output,keep_prob=1.0 - 0.1) +output = keras.layers.Lambda(output)(bert.model.output) + +output = Dense( + units=2, activation='softmax', kernel_initializer=bert.initializer +)(output) + +model = keras.models.Model(bert.model.input, output) + +model.summary() + +model.compile( + loss='sparse_categorical_crossentropy', + optimizer=Adam(2e-5), # 用足够小的学习率 + # optimizer=PiecewiseLinearLearningRate(Adam(5e-5), {10000: 1, 30000: 0.1}), + metrics=['accuracy'], +) + +# 转换数据集 +train_generator = data_generator(train_data, batch_size) +valid_generator = data_generator(valid_data, batch_size) +test_generator = data_generator(test_data, batch_size) + + +def evaluate(data): + total, right = 0., 0. + for x_true, y_true in data: + y_pred = model.predict(x_true).argmax(axis=1) + y_true = y_true[:, 0] + total += len(y_true) + right += (y_true == y_pred).sum() + return right / total + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.best_val_acc = 0. 
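+        # Note: best_val_acc tracks the best validation accuracy seen so far;
+        # on_epoch_end saves best_model.weights whenever it improves and
+        # reports test accuracy for reference only.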
+ + def on_epoch_end(self, epoch, logs=None): + val_acc = evaluate(valid_generator) + if val_acc > self.best_val_acc: + self.best_val_acc = val_acc + model.save_weights('best_model.weights') + test_acc = evaluate(test_generator) + print( + u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' % + (val_acc, self.best_val_acc, test_acc) + ) + +import time +class TimeHistory(keras.callbacks.Callback): + def __init__(self, logs={}): + self.logs=[] + def on_batch_begin(self,batch,logs={}): + self.start = time.time() + def on_batch_end(self,batch,logs={}): + self.logs.append(time.time() - self.start) + +time_callback = TimeHistory() + +if __name__ == '__main__': + #npu_keras_sess = set_keras_session_npu_config() + config_proto = tf.ConfigProto() + custom_op = config_proto.graph_options.rewrite_options.custom_optimizers.add() + custom_op.name = 'NpuOptimizer' + custom_op.parameter_map["auto_tune_mode"].s = tf.compat.as_bytes("RL,GA") + npu_keras_sess = set_keras_session_npu_config(config=config_proto) + + sess_config = tf.ConfigProto() + custom_op = sess_config.graph_options.rewrite_options.custom_optimizers.add() + custom_op.name = "NpuOptimizer" + custom_op.parameter_map["dynamic_input"].b = True + custom_op.parameter_map["dynamic_graph_execute_mode"].s = tf.compat.as_bytes("lazy_recompile") + custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") + custom_op.parameter_map["use_off_line"].b = True + #custom_op.parameter_map["dynamic_graph_execute_mode"].s = tf.compat.as_bytes("dynamic_execute") + + sess_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF + sess_config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF + sess = tf.Session(config=sess_config) + K.set_session(sess) + + evaluator = Evaluator() + + model.fit( + train_generator.forfit(), + steps_per_epoch=len(train_generator), + epochs=20, + callbacks=[evaluator,time_callback] + ) + + model.load_weights('best_model.weights') + print(u'final test acc: %05f\n' % (evaluate(test_generator))) + #close_session(npu_keras_sess) + print("====================",time_callback.logs) + #sess.close() +else: + + model.load_weights('best_model.weights') + diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_sentiment_albert.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_sentiment_albert.py new file mode 100644 index 0000000000000000000000000000000000000000..c9a274322c93d28b673ee94ca0076540e51a0058 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_sentiment_albert.py @@ -0,0 +1,173 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! -*- coding:utf-8 -*- +# 情感分析例子,加载albert_zh权重(https://github.com/brightmart/albert_zh) + +import numpy as np +from bert4keras.backend import keras, set_gelu +from bert4keras.tokenizers import Tokenizer +from bert4keras.models import build_transformer_model +from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr +from bert4keras.snippets import sequence_padding, DataGenerator +from bert4keras.snippets import open +from keras.layers import Lambda, Dense + +set_gelu('tanh') # 切换gelu版本 + +num_classes = 2 +maxlen = 128 +batch_size = 32 +config_path = '/root/kg/bert/albert_small_zh_google/albert_config.json' +checkpoint_path = '/root/kg/bert/albert_small_zh_google/albert_model.ckpt' +dict_path = '/root/kg/bert/albert_small_zh_google/vocab.txt' + + +def load_data(filename): + """加载数据 + 单条格式:(文本, 标签id) + """ + D = [] + with open(filename, encoding='utf-8') as f: + for l in f: + text, label = l.strip().split('\t') + D.append((text, int(label))) + return D + + +# 加载数据集 +train_data = load_data('datasets/sentiment/sentiment.train.data') +valid_data = load_data('datasets/sentiment/sentiment.valid.data') +test_data = load_data('datasets/sentiment/sentiment.test.data') + +# 建立分词器 +tokenizer = Tokenizer(dict_path, do_lower_case=True) + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + batch_token_ids, batch_segment_ids, batch_labels = [], [], [] + for is_end, (text, label) in self.sample(random): + token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) + batch_token_ids.append(token_ids) + batch_segment_ids.append(segment_ids) + batch_labels.append([label]) + if len(batch_token_ids) == self.batch_size or is_end: + batch_token_ids = sequence_padding(batch_token_ids) + batch_segment_ids = sequence_padding(batch_segment_ids) + batch_labels = sequence_padding(batch_labels) + yield [batch_token_ids, batch_segment_ids], batch_labels + batch_token_ids, batch_segment_ids, batch_labels = [], [], [] + + +# 加载预训练模型 +bert = build_transformer_model( + config_path=config_path, + checkpoint_path=checkpoint_path, + model='albert', + return_keras_model=False, +) + +output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output) +output = Dense( + units=num_classes, + activation='softmax', + kernel_initializer=bert.initializer +)(output) + +model = keras.models.Model(bert.model.input, output) +model.summary() + +# 派生为带分段线性学习率的优化器。 +# 其中name参数可选,但最好填入,以区分不同的派生优化器。 +AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR') + +model.compile( + loss='sparse_categorical_crossentropy', + # optimizer=Adam(1e-5), # 用足够小的学习率 + optimizer=AdamLR(learning_rate=1e-4, lr_schedule={ + 1000: 1, + 2000: 0.1 + }), + metrics=['accuracy'], +) + +# 转换数据集 +train_generator = data_generator(train_data, batch_size) +valid_generator = data_generator(valid_data, batch_size) +test_generator = data_generator(test_data, batch_size) + + +def evaluate(data): + total, right = 0., 0. 
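+    # Accuracy over the generator: the argmax of the softmax outputs is
+    # compared with the gold label ids stored in the first column of y_true.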
+ for x_true, y_true in data: + y_pred = model.predict(x_true).argmax(axis=1) + y_true = y_true[:, 0] + total += len(y_true) + right += (y_true == y_pred).sum() + return right / total + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.best_val_acc = 0. + + def on_epoch_end(self, epoch, logs=None): + val_acc = evaluate(valid_generator) + if val_acc > self.best_val_acc: + self.best_val_acc = val_acc + model.save_weights('best_model.weights') + test_acc = evaluate(test_generator) + print( + u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' % + (val_acc, self.best_val_acc, test_acc) + ) + + +if __name__ == '__main__': + + evaluator = Evaluator() + + model.fit( + train_generator.forfit(), + steps_per_epoch=len(train_generator), + epochs=10, + callbacks=[evaluator] + ) + + model.load_weights('best_model.weights') + print(u'final test acc: %05f\n' % (evaluate(test_generator))) + +else: + + model.load_weights('best_model.weights') diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_sentiment_integrated_gradients.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_sentiment_integrated_gradients.py new file mode 100644 index 0000000000000000000000000000000000000000..e46b17d9d3f21c4f888cc52fd738e772d4737072 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_sentiment_integrated_gradients.py @@ -0,0 +1,103 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding: utf-8 -*- +# 通过积分梯度(Integrated Gradients)来给输入进行重要性排序 +# 接 task_sentiment_albert.py +# 原始论文:https://arxiv.org/abs/1703.01365 +# 博客介绍:https://kexue.fm/archives/7533 +# 请读者务必先弄懂原理再看代码,下述代码仅是交互式演示代码,并非成品API + +from task_sentiment_albert import * +from keras.layers import Layer, Input +from bert4keras.backend import K, batch_gather +from keras.models import Model +from bert4keras.snippets import uniout + + +class Gradient(Layer): + """获取梯度的层 + """ + def __init__(self, **kwargs): + super(Gradient, self).__init__(**kwargs) + self.supports_masking = True + + def call(self, input): + input, output, label = input + output = batch_gather(output, label) + return K.gradients(output, [input])[0] * input + + def compute_output_shape(self, input_shape): + return input_shape[0] + + +label_in = Input(shape=(1,)) # 指定标签 +input = model.get_layer('Embedding-Token').output +output = model.output +grads = Gradient()([input, output, label_in]) +grad_model = Model(model.inputs + [label_in], grads) + +# 获取原始embedding层 +embeddings = model.get_layer('Embedding-Token').embeddings +values = K.eval(embeddings) + +text = u'这家店真黑心' +text = u'图太乱了 有点看不懂重点 讲故事的时候很难让孩子集中' +text = u'这是一本很好看的书' +text = u'这是一本很糟糕的书' +token_ids, segment_ids = tokenizer.encode(text) +preds = model.predict([[token_ids], [segment_ids]]) +label = np.argmax(preds[0]) + +pred_grads = [] +n = 20 +for i in range(n): + # nlp任务中参照背景通常直接选零向量,所以这里 + # 让embedding层从零渐变到原始值,以实现路径变换。 + alpha = 1.0 * i / (n - 1) + K.set_value(embeddings, alpha * values) + pred_grad = grad_model.predict([[token_ids], [segment_ids], [[label]]])[0] + pred_grads.append(pred_grad) + +# 然后求平均 +pred_grads = np.mean(pred_grads, 0) + +# 这时候我们得到形状为(seq_len, hidden_dim)的矩阵,我们要将它变换成(seq_len,) +# 这时候有两种方案:1、直接求模长;2、取绝对值后再取最大。两者效果差不多。 +scores = np.sqrt((pred_grads**2).sum(axis=1)) +scores = (scores - scores.min()) / (scores.max() - scores.min()) +scores = scores.round(4) +results = [(tokenizer.decode([t]), s) for t, s in zip(token_ids, scores)] +print(results[1:-1]) + +scores = np.abs(pred_grads).max(axis=1) +scores = (scores - scores.min()) / (scores.max() - scores.min()) +scores = scores.round(4) +results = [(tokenizer.decode([t]), s) for t, s in zip(token_ids, scores)] +print(results[1:-1]) diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_sentiment_virtual_adversarial_training.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_sentiment_virtual_adversarial_training.py new file mode 100644 index 0000000000000000000000000000000000000000..e2110a8d3dbd7954d04f7d2bb6b37a28410d8179 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_sentiment_virtual_adversarial_training.py @@ -0,0 +1,255 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! -*- coding:utf-8 -*- +# 通过虚拟对抗训练进行半监督学习 +# use_vat=True比use_vat=False约有1%的提升 +# 数据集:情感分析数据集 +# 博客:https://kexue.fm/archives/7466 +# 适用于Keras 2.3.1 + +import json +import numpy as np +from bert4keras.backend import keras, search_layer, K +from bert4keras.tokenizers import Tokenizer +from bert4keras.models import build_transformer_model +from bert4keras.optimizers import Adam +from bert4keras.snippets import sequence_padding, DataGenerator +from bert4keras.snippets import open +from keras.layers import Lambda, Dense +from keras.utils import to_categorical +from tqdm import tqdm + +# 配置信息 +num_classes = 2 +maxlen = 128 +batch_size = 32 +train_frac = 0.01 # 标注数据的比例 +use_vat = True # 可以比较True/False的效果 + +# BERT base +config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' + + +def load_data(filename): + """加载数据 + 单条格式:(文本, 标签id) + """ + D = [] + with open(filename, encoding='utf-8') as f: + for l in f: + text, label = l.strip().split('\t') + D.append((text, int(label))) + return D + + +# 加载数据集 +train_data = load_data('datasets/sentiment/sentiment.train.data') +valid_data = load_data('datasets/sentiment/sentiment.valid.data') +test_data = load_data('datasets/sentiment/sentiment.test.data') + +# 模拟标注和非标注数据 +num_labeled = int(len(train_data) * train_frac) +unlabeled_data = [(t, 0) for t, l in train_data[num_labeled:]] +train_data = train_data[:num_labeled] + +# 建立分词器 +tokenizer = Tokenizer(dict_path, do_lower_case=True) + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + batch_token_ids, batch_segment_ids, batch_labels = [], [], [] + for is_end, (text, label) in self.sample(random): + token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) + batch_token_ids.append(token_ids) + batch_segment_ids.append(segment_ids) + batch_labels.append(label) + if len(batch_token_ids) == self.batch_size or is_end: + batch_token_ids = sequence_padding(batch_token_ids) + batch_segment_ids = sequence_padding(batch_segment_ids) + batch_labels = to_categorical(batch_labels, num_classes) + yield [batch_token_ids, batch_segment_ids], batch_labels + batch_token_ids, batch_segment_ids, batch_labels = [], [], [] + + +# 转换数据集 +train_generator = data_generator(train_data, batch_size) +valid_generator = data_generator(valid_data, batch_size) +test_generator = data_generator(test_data, batch_size) + +# 加载预训练模型 +bert = build_transformer_model( + config_path=config_path, + checkpoint_path=checkpoint_path, + return_keras_model=False, +) + +output = Lambda(lambda x: x[:, 0])(bert.model.output) +output = Dense( + units=num_classes, + activation='softmax', + kernel_initializer=bert.initializer +)(output) + +# 用于正常训练的模型 +model = 
keras.models.Model(bert.model.input, output) +model.summary() + +model.compile( + loss='kld', + optimizer=Adam(2e-5), + metrics=['categorical_accuracy'], +) + +# 用于虚拟对抗训练的模型 +model_vat = keras.models.Model(bert.model.input, output) +model_vat.compile( + loss='kld', + optimizer=Adam(1e-5), + metrics=['categorical_accuracy'], +) + + +def virtual_adversarial_training( + model, embedding_name, epsilon=1, xi=10, iters=1 +): + """给模型添加虚拟对抗训练 + 其中model是需要添加对抗训练的keras模型,embedding_name + 则是model里边Embedding层的名字。要在模型compile之后使用。 + """ + if model.train_function is None: # 如果还没有训练函数 + model._make_train_function() # 手动make + old_train_function = model.train_function # 备份旧的训练函数 + + # 查找Embedding层 + for output in model.outputs: + embedding_layer = search_layer(output, embedding_name) + if embedding_layer is not None: + break + if embedding_layer is None: + raise Exception('Embedding layer not found') + + # 求Embedding梯度 + embeddings = embedding_layer.embeddings # Embedding矩阵 + gradients = K.gradients(model.total_loss, [embeddings]) # Embedding梯度 + gradients = K.zeros_like(embeddings) + gradients[0] # 转为dense tensor + + # 封装为函数 + inputs = ( + model._feed_inputs + model._feed_targets + model._feed_sample_weights + ) # 所有输入层 + model_outputs = K.function( + inputs=inputs, + outputs=model.outputs, + name='model_outputs', + ) # 模型输出函数 + embedding_gradients = K.function( + inputs=inputs, + outputs=[gradients], + name='embedding_gradients', + ) # 模型梯度函数 + + def l2_normalize(x): + return x / (np.sqrt((x**2).sum()) + 1e-8) + + def train_function(inputs): # 重新定义训练函数 + outputs = model_outputs(inputs) + inputs = inputs[:2] + outputs + inputs[3:] + delta1, delta2 = 0.0, np.random.randn(*K.int_shape(embeddings)) + for _ in range(iters): # 迭代求扰动 + delta2 = xi * l2_normalize(delta2) + K.set_value(embeddings, K.eval(embeddings) - delta1 + delta2) + delta1 = delta2 + delta2 = embedding_gradients(inputs)[0] # Embedding梯度 + delta2 = epsilon * l2_normalize(delta2) + K.set_value(embeddings, K.eval(embeddings) - delta1 + delta2) + outputs = old_train_function(inputs) # 梯度下降 + K.set_value(embeddings, K.eval(embeddings) - delta2) # 删除扰动 + return outputs + + model.train_function = train_function # 覆盖原训练函数 + + +# 写好函数后,启用对抗训练只需要一行代码 +virtual_adversarial_training(model_vat, 'Embedding-Token') + + +def evaluate(data): + total, right = 0., 0. + for x_true, y_true in data: + y_pred = model.predict(x_true).argmax(axis=1) + y_true = y_true.argmax(axis=1) + total += len(y_true) + right += (y_true == y_pred).sum() + return right / total + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.best_val_acc = 0. 
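+        # Note: self.data below is an endless generator over the unlabeled
+        # pool; on_batch_end draws one batch from it and performs a virtual
+        # adversarial update via model_vat.train_on_batch when use_vat is True.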
+ self.data = data_generator(unlabeled_data, batch_size).forfit() + + def on_epoch_end(self, epoch, logs=None): + val_acc = evaluate(valid_generator) + if val_acc > self.best_val_acc: + self.best_val_acc = val_acc + model.save_weights('best_model.weights') + test_acc = evaluate(test_generator) + print( + u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' % + (val_acc, self.best_val_acc, test_acc) + ) + + def on_batch_end(self, batch, logs=None): + if use_vat: + dx, dy = next(self.data) + model_vat.train_on_batch(dx, dy) + + +if __name__ == '__main__': + + evaluator = Evaluator() + + model.fit( + train_generator.forfit(), + steps_per_epoch=30, + epochs=100, + callbacks=[evaluator] + ) + +else: + + model.load_weights('best_model.weights') diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_seq2seq_ape210k_math_word_problem.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_seq2seq_ape210k_math_word_problem.py new file mode 100644 index 0000000000000000000000000000000000000000..d7a59814619b0e99fd0b823e3b1bf1ed537f265d --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_seq2seq_ape210k_math_word_problem.py @@ -0,0 +1,302 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding: utf-8 -*- +# 用Seq2Seq做小学数学应用题 +# 数据集为ape210k:https://github.com/Chenny0808/ape210k +# Base版准确率为70%+,Large版准确率为73%+ +# 实测环境:tensorflow 1.14 + keras 2.3.1 + bert4keras 0.8.8 +# 介绍链接:https://kexue.fm/archives/7809 + +from __future__ import division +import json, re +import numpy as np +import pandas as pd +from tqdm import tqdm +from bert4keras.backend import keras, K +from bert4keras.layers import Loss +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import Tokenizer, load_vocab +from bert4keras.optimizers import Adam +from bert4keras.snippets import sequence_padding, open +from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder +from keras.models import Model +from sympy import Integer + +# 基本参数 +maxlen = 192 +batch_size = 32 +epochs = 100 + +# bert配置 +config_path = '/root/kg/bert/uer/mixed_corpus_bert_base_model/bert_config.json' +checkpoint_path = '/root/kg/bert/uer/mixed_corpus_bert_base_model/bert_model.ckpt' +dict_path = '/root/kg/bert/uer/mixed_corpus_bert_base_model/vocab.txt' + + +def is_equal(a, b): + """比较两个结果是否相等 + """ + a = round(float(a), 6) + b = round(float(b), 6) + return a == b + + +def remove_bucket(equation): + """去掉冗余的括号 + """ + l_buckets, buckets = [], [] + for i, c in enumerate(equation): + if c == '(': + l_buckets.append(i) + elif c == ')': + buckets.append((l_buckets.pop(), i)) + eval_equation = eval(equation) + for l, r in buckets: + new_equation = '%s %s %s' % ( + equation[:l], equation[l + 1:r], equation[r + 1:] + ) + try: + if is_equal(eval(new_equation.replace(' ', '')), eval_equation): + equation = new_equation + except: + pass + return equation.replace(' ', '') + + +def load_data(filename): + """读取训练数据,并做一些标准化,保证equation是可以eval的 + 参考:https://kexue.fm/archives/7809 + """ + D = [] + for l in open(filename): + l = json.loads(l) + question, equation, answer = l['original_text'], l['equation'], l['ans'] + # 处理带分数 + question = re.sub('(\d+)\((\d+/\d+)\)', '(\\1+\\2)', question) + equation = re.sub('(\d+)\((\d+/\d+)\)', '(\\1+\\2)', equation) + answer = re.sub('(\d+)\((\d+/\d+)\)', '(\\1+\\2)', answer) + equation = re.sub('(\d+)\(', '\\1+(', equation) + answer = re.sub('(\d+)\(', '\\1+(', answer) + # 分数去括号 + question = re.sub('\((\d+/\d+)\)', '\\1', question) + # 处理百分数 + equation = re.sub('([\.\d]+)%', '(\\1/100)', equation) + answer = re.sub('([\.\d]+)%', '(\\1/100)', answer) + # 冒号转除号、剩余百分号处理 + equation = equation.replace(':', '/').replace('%', '/100') + answer = answer.replace(':', '/').replace('%', '/100') + if equation[:2] == 'x=': + equation = equation[2:] + try: + if is_equal(eval(equation), eval(answer)): + D.append((question, remove_bucket(equation), answer)) + except: + continue + return D + + +# 加载数据集 +train_data = load_data('/root/ape210k/train.ape.json') +valid_data = load_data('/root/ape210k/valid.ape.json') +test_data = load_data('/root/ape210k/test.ape.json') + +# 加载并精简词表,建立分词器 +token_dict, keep_tokens = load_vocab( + dict_path=dict_path, + simplified=True, + startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'], +) +tokenizer = Tokenizer(token_dict, do_lower_case=True) + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + batch_token_ids, batch_segment_ids = [], [] + for is_end, (question, equation, answer) in self.sample(random): + token_ids, segment_ids = tokenizer.encode( + question, equation, maxlen=maxlen + ) + batch_token_ids.append(token_ids) + batch_segment_ids.append(segment_ids) + if len(batch_token_ids) == self.batch_size or is_end: + 
batch_token_ids = sequence_padding(batch_token_ids) + batch_segment_ids = sequence_padding(batch_segment_ids) + yield [batch_token_ids, batch_segment_ids], None + batch_token_ids, batch_segment_ids = [], [] + + +class CrossEntropy(Loss): + """交叉熵作为loss,并mask掉输入部分 + """ + def compute_loss(self, inputs, mask=None): + y_true, y_mask, y_pred = inputs + y_true = y_true[:, 1:] # 目标token_ids + y_mask = y_mask[:, 1:] # segment_ids,刚好指示了要预测的部分 + y_pred = y_pred[:, :-1] # 预测序列,错开一位 + loss = K.sparse_categorical_crossentropy(y_true, y_pred) + loss = K.sum(loss * y_mask) / K.sum(y_mask) + return loss + + +model = build_transformer_model( + config_path, + checkpoint_path, + application='unilm', + keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表 +) + +output = CrossEntropy(2)(model.inputs + model.outputs) + +model = Model(model.inputs, output) +model.compile(optimizer=Adam(2e-5)) +model.summary() + + +class AutoSolve(AutoRegressiveDecoder): + """seq2seq解码器 + """ + @AutoRegressiveDecoder.wraps(default_rtype='probas') + def predict(self, inputs, output_ids, states): + token_ids, segment_ids = inputs + token_ids = np.concatenate([token_ids, output_ids], 1) + segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1) + return self.last_token(model).predict([token_ids, segment_ids]) + + def generate(self, text, topk=1): + token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) + output_ids = self.beam_search([token_ids, segment_ids], + topk=topk) # 基于beam search + return tokenizer.decode(output_ids).replace(' ', '') + + +autosolve = AutoSolve(start_id=None, end_id=tokenizer._token_end_id, maxlen=64) + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.best_acc = 0. + + def on_epoch_end(self, epoch, logs=None): + metrics = self.evaluate(valid_data) # 评测模型 + if metrics['acc'] >= self.best_acc: + self.best_acc = metrics['acc'] + model.save_weights('./best_model.weights') # 保存模型 + metrics['best_acc'] = self.best_acc + print('valid_data:', metrics) + + def evaluate(self, data, topk=1): + total, right = 0.0, 0.0 + for question, equation, answer in tqdm(data): + total += 1 + pred_equation = autosolve.generate(question, topk) + try: + right += int(is_equal(eval(pred_equation), eval(answer))) + except: + pass + return {'acc': right / total} + + +def predict(in_file, out_file, topk=1): + """输出预测结果到文件 + 该函数主要为比赛 https://www.datafountain.cn/competitions/467 所写, + 主要是读取该比赛的测试集,然后预测equation,并且根据不同的问题输出不同格式的答案, + out_file可以直接提交到线上评测,线上准确率可以达到38%+。 + """ + fw = open(out_file, 'w', encoding='utf-8') + raw_data = pd.read_csv(in_file, header=None, encoding='utf-8') + for i, question in tqdm(raw_data.values): + question = re.sub('(\d+)_(\d+/\d+)', '(\\1+\\2)', question) + pred_equation = autosolve.generate(question, topk) + if '.' not in pred_equation: + pred_equation = re.sub('([\d]+)', 'Integer(\\1)', pred_equation) + try: + pred_answer = eval(pred_equation) + except: + pred_answer = np.random.choice(21) + 1 + if '.' 
in pred_equation: + if u'百分之几' in question: + pred_answer = pred_answer * 100 + pred_answer = round(pred_answer, 2) + if int(pred_answer) == pred_answer: + pred_answer = int(pred_answer) + if ( + re.findall(u'多少[辆|人|个|只|箱|包本|束|头|盒|张]', question) or + re.findall(u'几[辆|人|个|只|箱|包|本|束|头|盒|张]', question) + ): + if re.findall(u'至少|最少', question): + pred_answer = np.ceil(pred_answer) + elif re.findall(u'至多|最多', question): + pred_answer = np.floor(pred_answer) + else: + pred_answer = np.ceil(pred_answer) + pred_answer = int(pred_answer) + pred_answer = str(pred_answer) + if u'百分之几' in question: + pred_answer = pred_answer + '%' + else: + pred_answer = str(pred_answer) + if '/' in pred_answer: + if re.findall('\d+/\d+', question): + a, b = pred_answer.split('/') + a, b = int(a), int(b) + if a > b: + pred_answer = '%s_%s/%s' % (a // b, a % b, b) + else: + if re.findall(u'至少|最少', question): + pred_answer = np.ceil(eval(pred_answer)) + elif re.findall(u'至多|最多', question): + pred_answer = np.floor(eval(pred_answer)) + else: + pred_answer = np.ceil(eval(pred_answer)) + pred_answer = str(int(pred_answer)) + fw.write(str(i) + ',' + pred_answer + '\n') + fw.flush() + fw.close() + + +if __name__ == '__main__': + + evaluator = Evaluator() + train_generator = data_generator(train_data, batch_size) + + model.fit( + train_generator.forfit(), + steps_per_epoch=len(train_generator), + epochs=epochs, + callbacks=[evaluator] + ) + +else: + + model.load_weights('./best_model.weights') diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_seq2seq_autotitle.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_seq2seq_autotitle.py new file mode 100644 index 0000000000000000000000000000000000000000..6e11e3ac2728142c3066afaddb6febcc70e56ce1 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_seq2seq_autotitle.py @@ -0,0 +1,177 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding: utf-8 -*- +# bert做Seq2Seq任务,采用UNILM方案 +# 介绍链接:https://kexue.fm/archives/6933 + +from __future__ import print_function +import glob +import numpy as np +from bert4keras.backend import keras, K +from bert4keras.layers import Loss +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import Tokenizer, load_vocab +from bert4keras.optimizers import Adam +from bert4keras.snippets import sequence_padding, open +from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder +from keras.models import Model + +# 基本参数 +maxlen = 256 +batch_size = 16 +steps_per_epoch = 1000 +epochs = 10000 + +# bert配置 +config_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/vocab.txt' + +# 训练样本。THUCNews数据集,每个样本保存为一个txt。 +txts = glob.glob('/root/thuctc/THUCNews/*/*.txt') + +# 加载并精简词表,建立分词器 +token_dict, keep_tokens = load_vocab( + dict_path=dict_path, + simplified=True, + startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'], +) +tokenizer = Tokenizer(token_dict, do_lower_case=True) + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + batch_token_ids, batch_segment_ids = [], [] + for is_end, txt in self.sample(random): + text = open(txt, encoding='utf-8').read() + text = text.split('\n') + if len(text) > 1: + title = text[0] + content = '\n'.join(text[1:]) + token_ids, segment_ids = tokenizer.encode( + content, title, maxlen=maxlen + ) + batch_token_ids.append(token_ids) + batch_segment_ids.append(segment_ids) + if len(batch_token_ids) == self.batch_size or is_end: + batch_token_ids = sequence_padding(batch_token_ids) + batch_segment_ids = sequence_padding(batch_segment_ids) + yield [batch_token_ids, batch_segment_ids], None + batch_token_ids, batch_segment_ids = [], [] + + +class CrossEntropy(Loss): + """交叉熵作为loss,并mask掉输入部分 + """ + def compute_loss(self, inputs, mask=None): + y_true, y_mask, y_pred = inputs + y_true = y_true[:, 1:] # 目标token_ids + y_mask = y_mask[:, 1:] # segment_ids,刚好指示了要预测的部分 + y_pred = y_pred[:, :-1] # 预测序列,错开一位 + loss = K.sparse_categorical_crossentropy(y_true, y_pred) + loss = K.sum(loss * y_mask) / K.sum(y_mask) + return loss + + +model = build_transformer_model( + config_path, + checkpoint_path, + application='unilm', + keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表 +) + +output = CrossEntropy(2)(model.inputs + model.outputs) + +model = Model(model.inputs, output) +model.compile(optimizer=Adam(1e-5)) +model.summary() + + +class AutoTitle(AutoRegressiveDecoder): + """seq2seq解码器 + """ + @AutoRegressiveDecoder.wraps(default_rtype='probas') + def predict(self, inputs, output_ids, states): + token_ids, segment_ids = inputs + token_ids = np.concatenate([token_ids, output_ids], 1) + segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1) + return self.last_token(model).predict([token_ids, segment_ids]) + + def generate(self, text, topk=1): + max_c_len = maxlen - self.maxlen + token_ids, segment_ids = tokenizer.encode(text, maxlen=max_c_len) + output_ids = self.beam_search([token_ids, segment_ids], + topk=topk) # 基于beam search + return tokenizer.decode(output_ids) + + +autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=32) + + +def just_show(): + s1 = u'夏天来临,皮肤在强烈紫外线的照射下,晒伤不可避免,因此,晒后及时修复显得尤为重要,否则可能会造成长期伤害。专家表示,选择晒后护肤品要慎重,芦荟凝胶是最安全,有效的一种选择,晒伤严重者,还请及 时 就医 。' + s2 = 
u'8月28日,网络爆料称,华住集团旗下连锁酒店用户数据疑似发生泄露。从卖家发布的内容看,数据包含华住旗下汉庭、禧玥、桔子、宜必思等10余个品牌酒店的住客信息。泄露的信息包括华住官网注册资料、酒店入住登记的身份信息及酒店开房记录,住客姓名、手机号、邮箱、身份证号、登录账号密码等。卖家对这个约5亿条数据打包出售。第三方安全平台威胁猎人对信息出售者提供的三万条数据进行验证,认为数据真实性非常高。当天下午 ,华 住集 团发声明称,已在内部迅速开展核查,并第一时间报警。当晚,上海警方消息称,接到华住集团报案,警方已经介入调查。' + for s in [s1, s2]: + print(u'生成标题:', autotitle.generate(s)) + print() + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.lowest = 1e10 + + def on_epoch_end(self, epoch, logs=None): + # 保存最优 + if logs['loss'] <= self.lowest: + self.lowest = logs['loss'] + model.save_weights('./best_model.weights') + # 演示效果 + just_show() + + +if __name__ == '__main__': + + evaluator = Evaluator() + train_generator = data_generator(txts, batch_size) + + model.fit( + train_generator.forfit(), + steps_per_epoch=steps_per_epoch, + epochs=epochs, + callbacks=[evaluator] + ) + +else: + + model.load_weights('./best_model.weights') diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_seq2seq_autotitle_csl.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_seq2seq_autotitle_csl.py new file mode 100644 index 0000000000000000000000000000000000000000..ba20852969ad15340dbc848662987bbdd0100482 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_seq2seq_autotitle_csl.py @@ -0,0 +1,212 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding: utf-8 -*- +# bert做Seq2Seq任务,采用UNILM方案 +# 介绍链接:https://kexue.fm/archives/6933 +# 数据集:https://github.com/CLUEbenchmark/CLGE 中的CSL数据集 +# 补充了评测指标bleu、rouge-1、rouge-2、rouge-l + +from __future__ import print_function +import numpy as np +from tqdm import tqdm +from bert4keras.backend import keras, K +from bert4keras.layers import Loss +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import Tokenizer, load_vocab +from bert4keras.optimizers import Adam +from bert4keras.snippets import sequence_padding, open +from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder +from keras.models import Model +from rouge import Rouge # pip install rouge +from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction + +# 基本参数 +maxlen = 256 +batch_size = 16 +epochs = 20 + +# bert配置 +config_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/vocab.txt' + + +def load_data(filename): + """加载数据 + 单条格式:(标题, 正文) + """ + D = [] + with open(filename, encoding='utf-8') as f: + for l in f: + title, content = l.strip().split('\t') + D.append((title, content)) + return D + + +# 加载数据集 +train_data = load_data('/root/csl/train.tsv') +valid_data = load_data('/root/csl/val.tsv') +test_data = load_data('/root/csl/test.tsv') + +# 加载并精简词表,建立分词器 +token_dict, keep_tokens = load_vocab( + dict_path=dict_path, + simplified=True, + startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'], +) +tokenizer = Tokenizer(token_dict, do_lower_case=True) + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + batch_token_ids, batch_segment_ids = [], [] + for is_end, (title, content) in self.sample(random): + token_ids, segment_ids = tokenizer.encode( + content, title, maxlen=maxlen + ) + batch_token_ids.append(token_ids) + batch_segment_ids.append(segment_ids) + if len(batch_token_ids) == self.batch_size or is_end: + batch_token_ids = sequence_padding(batch_token_ids) + batch_segment_ids = sequence_padding(batch_segment_ids) + yield [batch_token_ids, batch_segment_ids], None + batch_token_ids, batch_segment_ids = [], [] + + +class CrossEntropy(Loss): + """交叉熵作为loss,并mask掉输入部分 + """ + def compute_loss(self, inputs, mask=None): + y_true, y_mask, y_pred = inputs + y_true = y_true[:, 1:] # 目标token_ids + y_mask = y_mask[:, 1:] # segment_ids,刚好指示了要预测的部分 + y_pred = y_pred[:, :-1] # 预测序列,错开一位 + loss = K.sparse_categorical_crossentropy(y_true, y_pred) + loss = K.sum(loss * y_mask) / K.sum(y_mask) + return loss + + +model = build_transformer_model( + config_path, + checkpoint_path, + application='unilm', + keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表 +) + +output = CrossEntropy(2)(model.inputs + model.outputs) + +model = Model(model.inputs, output) +model.compile(optimizer=Adam(1e-5)) +model.summary() + + +class AutoTitle(AutoRegressiveDecoder): + """seq2seq解码器 + """ + @AutoRegressiveDecoder.wraps(default_rtype='probas') + def predict(self, inputs, output_ids, states): + token_ids, segment_ids = inputs + token_ids = np.concatenate([token_ids, output_ids], 1) + segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1) + return self.last_token(model).predict([token_ids, segment_ids]) + + def generate(self, text, topk=1): + max_c_len = maxlen - self.maxlen + token_ids, segment_ids = tokenizer.encode(text, maxlen=max_c_len) + output_ids = self.beam_search([token_ids, segment_ids], + topk=topk) # 
基于beam search + return tokenizer.decode(output_ids) + + +autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=32) + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.rouge = Rouge() + self.smooth = SmoothingFunction().method1 + self.best_bleu = 0. + + def on_epoch_end(self, epoch, logs=None): + metrics = self.evaluate(valid_data) # 评测模型 + if metrics['bleu'] > self.best_bleu: + self.best_bleu = metrics['bleu'] + model.save_weights('./best_model.weights') # 保存模型 + metrics['best_bleu'] = self.best_bleu + print('valid_data:', metrics) + + def evaluate(self, data, topk=1): + total = 0 + rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0 + for title, content in tqdm(data): + total += 1 + title = ' '.join(title).lower() + pred_title = ' '.join(autotitle.generate(content, topk)).lower() + if pred_title.strip(): + scores = self.rouge.get_scores(hyps=pred_title, refs=title) + rouge_1 += scores[0]['rouge-1']['f'] + rouge_2 += scores[0]['rouge-2']['f'] + rouge_l += scores[0]['rouge-l']['f'] + bleu += sentence_bleu( + references=[title.split(' ')], + hypothesis=pred_title.split(' '), + smoothing_function=self.smooth + ) + rouge_1 /= total + rouge_2 /= total + rouge_l /= total + bleu /= total + return { + 'rouge-1': rouge_1, + 'rouge-2': rouge_2, + 'rouge-l': rouge_l, + 'bleu': bleu, + } + + +if __name__ == '__main__': + + evaluator = Evaluator() + train_generator = data_generator(train_data, batch_size) + + model.fit( + train_generator.forfit(), + steps_per_epoch=len(train_generator), + epochs=epochs, + callbacks=[evaluator] + ) + +else: + + model.load_weights('./best_model.weights') diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_seq2seq_autotitle_csl_mt5.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_seq2seq_autotitle_csl_mt5.py new file mode 100644 index 0000000000000000000000000000000000000000..ab6a7a10790c50c4aaeae194ad1edfc055e95d26 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_seq2seq_autotitle_csl_mt5.py @@ -0,0 +1,218 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding: utf-8 -*-
+# 微调多国语言版T5做Seq2Seq任务
+# 介绍链接:https://kexue.fm/archives/7867
+# 细节请看:https://github.com/bojone/t5_in_bert4keras
+# 数据集:https://github.com/CLUEbenchmark/CLGE 中的CSL数据集
+# 补充了评测指标bleu、rouge-1、rouge-2、rouge-l
+
+from __future__ import print_function
+import json
+import numpy as np
+from tqdm import tqdm
+from bert4keras.backend import keras, K
+from bert4keras.layers import Loss
+from bert4keras.models import build_transformer_model
+from bert4keras.tokenizers import SpTokenizer
+from bert4keras.optimizers import Adam
+from bert4keras.snippets import sequence_padding, open
+from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder
+from keras.models import Model
+from rouge import Rouge  # pip install rouge
+from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
+
+# 基本参数
+max_c_len = 256
+max_t_len = 32
+batch_size = 16
+epochs = 40
+
+# 模型路径
+config_path = '/root/kg/bert/mt5/mt5_base/mt5_base_config.json'
+checkpoint_path = '/root/kg/bert/mt5/mt5_base/model.ckpt-1000000'
+spm_path = '/root/kg/bert/mt5/sentencepiece_cn.model'
+keep_tokens_path = '/root/kg/bert/mt5/sentencepiece_cn_keep_tokens.json'
+
+
+def load_data(filename):
+    """加载数据
+    单条格式:(标题, 正文)
+    """
+    D = []
+    with open(filename, encoding='utf-8') as f:
+        for l in f:
+            title, content = l.strip().split('\t')
+            D.append((title, content))
+    return D
+
+
+# 加载数据集
+train_data = load_data('/root/csl/train.tsv')
+valid_data = load_data('/root/csl/val.tsv')
+test_data = load_data('/root/csl/test.tsv')
+
+# 加载分词器
+tokenizer = SpTokenizer(spm_path, token_start=None, token_end='</s>')
+keep_tokens = json.load(open(keep_tokens_path))
+
+
+class data_generator(DataGenerator):
+    """数据生成器
+    """
+    def __iter__(self, random=False):
+        batch_c_token_ids, batch_t_token_ids = [], []
+        for is_end, (title, content) in self.sample(random):
+            c_token_ids, _ = tokenizer.encode(content, maxlen=max_c_len)
+            t_token_ids, _ = tokenizer.encode(title, maxlen=max_t_len)
+            batch_c_token_ids.append(c_token_ids)
+            batch_t_token_ids.append([0] + t_token_ids)
+            if len(batch_c_token_ids) == self.batch_size or is_end:
+                batch_c_token_ids = sequence_padding(batch_c_token_ids)
+                batch_t_token_ids = sequence_padding(batch_t_token_ids)
+                yield [batch_c_token_ids, batch_t_token_ids], None
+                batch_c_token_ids, batch_t_token_ids = [], []
+
+
+class CrossEntropy(Loss):
+    """交叉熵作为loss,并mask掉输入部分
+    """
+    def compute_loss(self, inputs, mask=None):
+        y_true, y_pred = inputs
+        y_true = y_true[:, 1:]  # 目标token_ids
+        y_mask = K.cast(mask[1], K.floatx())[:, 1:]  # 解码器自带mask
+        y_pred = y_pred[:, :-1]  # 预测序列,错开一位
+        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
+        loss = K.sum(loss * y_mask) / K.sum(y_mask)
+        return loss
+
+
+t5 = build_transformer_model(
+    config_path=config_path,
+    checkpoint_path=checkpoint_path,
+    keep_tokens=keep_tokens,
+    model='mt5.1.1',
+    return_keras_model=False,
+    name='T5',
+)
+
+encoder = t5.encoder
+decoder = t5.decoder
+model = t5.model
+model.summary()
+
+output = CrossEntropy(1)([model.inputs[1], model.outputs[0]])
+
+model = Model(model.inputs, output)
+model.compile(optimizer=Adam(2e-4))
+
+
+class AutoTitle(AutoRegressiveDecoder):
+    """seq2seq解码器
+    """
+    @AutoRegressiveDecoder.wraps(default_rtype='probas')
+    def predict(self, inputs, output_ids, states):
+        c_encoded = inputs[0]
+        return self.last_token(decoder).predict([c_encoded, output_ids])
+
+    def generate(self, text, topk=1):
+        c_token_ids, _ = tokenizer.encode(text, maxlen=max_c_len)
+        c_encoded = encoder.predict(np.array([c_token_ids]))[0]
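+        # 说明:encoder只对输入前向一次,得到的c_encoded会在beam search的
+        # 每一步解码中复用(见上面predict里的self.last_token(decoder)),
+        # 避免每一步都重新编码原文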
output_ids = self.beam_search([c_encoded], topk=topk) # 基于beam search + return tokenizer.decode([int(i) for i in output_ids]) + + +# 注:T5有一个很让人不解的设置,它的标记id是0,即其实都是0 +autotitle = AutoTitle( + start_id=0, end_id=tokenizer._token_end_id, maxlen=max_t_len +) + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.rouge = Rouge() + self.smooth = SmoothingFunction().method1 + self.best_bleu = 0. + + def on_epoch_end(self, epoch, logs=None): + metrics = self.evaluate(valid_data) # 评测模型 + if metrics['bleu'] > self.best_bleu: + self.best_bleu = metrics['bleu'] + model.save_weights('./best_model.weights') # 保存模型 + metrics['best_bleu'] = self.best_bleu + print('valid_data:', metrics) + + def evaluate(self, data, topk=1): + total = 0 + rouge_1, rouge_2, rouge_l, bleu = 0, 0, 0, 0 + for title, content in tqdm(data): + total += 1 + title = ' '.join(title).lower() + pred_title = ' '.join(autotitle.generate(content, + topk=topk)).lower() + if pred_title.strip(): + scores = self.rouge.get_scores(hyps=pred_title, refs=title) + rouge_1 += scores[0]['rouge-1']['f'] + rouge_2 += scores[0]['rouge-2']['f'] + rouge_l += scores[0]['rouge-l']['f'] + bleu += sentence_bleu( + references=[title.split(' ')], + hypothesis=pred_title.split(' '), + smoothing_function=self.smooth + ) + rouge_1 /= total + rouge_2 /= total + rouge_l /= total + bleu /= total + return { + 'rouge-1': rouge_1, + 'rouge-2': rouge_2, + 'rouge-l': rouge_l, + 'bleu': bleu, + } + + +if __name__ == '__main__': + + evaluator = Evaluator() + train_generator = data_generator(train_data, batch_size) + + model.fit( + train_generator.forfit(), + steps_per_epoch=len(train_generator), + epochs=epochs, + callbacks=[evaluator] + ) + +else: + + model.load_weights('./best_model.weights') diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_seq2seq_autotitle_multigpu.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_seq2seq_autotitle_multigpu.py new file mode 100644 index 0000000000000000000000000000000000000000..f5b165c161d5243fa8c6fc1615b9aadb8e0c4e6c --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_seq2seq_autotitle_multigpu.py @@ -0,0 +1,191 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding: utf-8 -*- +# bert做Seq2Seq任务,采用UNILM方案 +# 介绍链接:https://kexue.fm/archives/6933 +# 单机多卡版本,读者可以对照着 task_seq2seq_autotitle.py 阅读 + +from __future__ import print_function + +import os +os.environ['TF_KERAS'] = '1' # 必须使用tf.keras + +import glob +import numpy as np +from bert4keras.backend import keras, K +from bert4keras.layers import Loss +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import Tokenizer, load_vocab +from bert4keras.optimizers import Adam +from bert4keras.snippets import sequence_padding, open +from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder +from keras.models import Model +import tensorflow as tf # 导入tf,备用 + +# 基本参数 +maxlen = 256 +batch_size = 64 +steps_per_epoch = 1000 +epochs = 10000 + +# bert配置 +config_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/vocab.txt' + +# 训练样本。THUCNews数据集,每个样本保存为一个txt。 +txts = glob.glob('/root/thuctc/THUCNews/*/*.txt') + +# 加载并精简词表,建立分词器 +token_dict, keep_tokens = load_vocab( + dict_path=dict_path, + simplified=True, + startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'], +) +tokenizer = Tokenizer(token_dict, do_lower_case=True) + + +class data_generator(DataGenerator): + """数据生成器 + (每次只需要返回一条样本) + """ + def __iter__(self, random=False): + for is_end, txt in self.sample(random): + text = open(txt, encoding='utf-8').read() + text = text.split('\n') + if len(text) > 1: + title = text[0] + content = '\n'.join(text[1:]) + token_ids, segment_ids = tokenizer.encode( + content, title, maxlen=maxlen + ) + # 返回一条样本 + yield token_ids, segment_ids + + +class CrossEntropy(Loss): + """交叉熵作为loss,并mask掉输入部分 + """ + def compute_loss(self, inputs, mask=None): + y_true, y_mask, y_pred = inputs + y_true = y_true[:, 1:] # 目标token_ids + y_mask = y_mask[:, 1:] # segment_ids,刚好指示了要预测的部分 + y_pred = y_pred[:, :-1] # 预测序列,错开一位 + loss = K.sparse_categorical_crossentropy(y_true, y_pred) + loss = K.sum(loss * y_mask) / K.sum(y_mask) + return loss + + +strategy = tf.distribute.MirroredStrategy() # 建立单机多卡策略 + +with strategy.scope(): # 调用该策略 + + bert = build_transformer_model( + config_path, + checkpoint_path=None, # 此时可以不加载预训练权重 + application='unilm', + keep_tokens=keep_tokens, # 只保留keep_tokens中的字,精简原字表 + return_keras_model=False, # 返回bert4keras类,而不是keras模型 + ) + + model = bert.model # 这个才是keras模型 + output = CrossEntropy(2)(model.inputs + model.outputs) + + model = Model(model.inputs, output) + model.compile(optimizer=Adam(1e-5)) + model.summary() + bert.load_weights_from_checkpoint(checkpoint_path) # 必须最后才加载预训练权重 + + +class AutoTitle(AutoRegressiveDecoder): + """seq2seq解码器 + """ + @AutoRegressiveDecoder.wraps(default_rtype='probas') + def predict(self, inputs, output_ids, states): + token_ids, segment_ids = inputs + token_ids = np.concatenate([token_ids, output_ids], 1) + segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1) + return self.last_token(model).predict([token_ids, segment_ids]) + + def generate(self, text, topk=1): + max_c_len = maxlen - self.maxlen + token_ids, segment_ids = tokenizer.encode(text, maxlen=max_c_len) + output_ids = self.beam_search([token_ids, segment_ids], + topk=topk) # 基于beam search + return tokenizer.decode(output_ids) + + +autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=32) + + +def just_show(): + s1 = 
u'夏天来临,皮肤在强烈紫外线的照射下,晒伤不可避免,因此,晒后及时修复显得尤为重要,否则可能会造成长期伤害。专家表示,选择晒后护肤品要慎重,芦荟凝胶是最安全,有效的一种选择,晒伤严重者,还请及 时 就医 。' + s2 = u'8月28日,网络爆料称,华住集团旗下连锁酒店用户数据疑似发生泄露。从卖家发布的内容看,数据包含华住旗下汉庭、禧玥、桔子、宜必思等10余个品牌酒店的住客信息。泄露的信息包括华住官网注册资料、酒店入住登记的身份信息及酒店开房记录,住客姓名、手机号、邮箱、身份证号、登录账号密码等。卖家对这个约5亿条数据打包出售。第三方安全平台威胁猎人对信息出售者提供的三万条数据进行验证,认为数据真实性非常高。当天下午 ,华 住集 团发声明称,已在内部迅速开展核查,并第一时间报警。当晚,上海警方消息称,接到华住集团报案,警方已经介入调查。' + for s in [s1, s2]: + print(u'生成标题:', autotitle.generate(s)) + print() + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.lowest = 1e10 + + def on_epoch_end(self, epoch, logs=None): + # 保存最优 + if logs['loss'] <= self.lowest: + self.lowest = logs['loss'] + model.save_weights('./best_model.weights') + # 演示效果 + just_show() + + +if __name__ == '__main__': + + evaluator = Evaluator() + train_generator = data_generator(txts, batch_size) + dataset = train_generator.to_dataset( + types=('float32', 'float32'), + shapes=([None], [None]), # 配合后面的padded_batch=True,实现自动padding + names=('Input-Token', 'Input-Segment'), + padded_batch=True + ) # 数据要转为tf.data.Dataset格式,names跟输入层的名字对应 + + model.fit( + dataset, + steps_per_epoch=steps_per_epoch, + epochs=epochs, + callbacks=[evaluator] + ) + +else: + + model.load_weights('./best_model.weights') diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_sequence_labeling_cws_crf.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_sequence_labeling_cws_crf.py new file mode 100644 index 0000000000000000000000000000000000000000..47b6e9827d61f975c7491fcabe1afd0cae350dd9 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_sequence_labeling_cws_crf.py @@ -0,0 +1,252 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding: utf-8 -*- +# 用CRF做中文分词(CWS, Chinese Word Segmentation) +# 数据集 http://sighan.cs.uchicago.edu/bakeoff2005/ +# 最后测试集的F1约为96.1% + +import re, os, json +import numpy as np +from bert4keras.backend import keras, K +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import Tokenizer +from bert4keras.optimizers import Adam +from bert4keras.snippets import sequence_padding, DataGenerator +from bert4keras.snippets import open, ViterbiDecoder, to_array +from bert4keras.layers import ConditionalRandomField +from keras.layers import Dense +from keras.models import Model +from tqdm import tqdm + +maxlen = 256 +epochs = 10 +num_labels = 4 +batch_size = 32 +bert_layers = 12 +learning_rate = 1e-5 # bert_layers越小,学习率应该要越大 +crf_lr_multiplier = 1 # 必要时扩大CRF层的学习率 + +# bert配置 +config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' + + +def load_data(filename): + """加载数据 + 单条格式:[词1, 词2, 词3, ...] + """ + D = [] + with open(filename, encoding='utf-8') as f: + for l in f: + D.append(re.split(' +', l.strip())) + return D + + +# 标注数据 +data = load_data('/root/icwb2-data/training/pku_training.utf8') + +# 保存一个随机序(供划分valid用) +if not os.path.exists('../random_order.json'): + random_order = list(range(len(data))) + np.random.shuffle(random_order) + json.dump(random_order, open('../random_order.json', 'w'), indent=4) +else: + random_order = json.load(open('../random_order.json')) + +# 划分valid +train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0] +valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0] + +# 建立分词器 +tokenizer = Tokenizer(dict_path, do_lower_case=True) + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + """标签含义 + 0: 单字词; 1: 多字词首字; 2: 多字词中间; 3: 多字词末字 + """ + batch_token_ids, batch_segment_ids, batch_labels = [], [], [] + for is_end, item in self.sample(random): + token_ids, labels = [tokenizer._token_start_id], [0] + for w in item: + w_token_ids = tokenizer.encode(w)[0][1:-1] + if len(token_ids) + len(w_token_ids) < maxlen: + token_ids += w_token_ids + if len(w_token_ids) == 1: + labels += [0] + else: + labels += [1] + [2] * (len(w_token_ids) - 2) + [3] + else: + break + token_ids += [tokenizer._token_end_id] + labels += [0] + segment_ids = [0] * len(token_ids) + batch_token_ids.append(token_ids) + batch_segment_ids.append(segment_ids) + batch_labels.append(labels) + if len(batch_token_ids) == self.batch_size or is_end: + batch_token_ids = sequence_padding(batch_token_ids) + batch_segment_ids = sequence_padding(batch_segment_ids) + batch_labels = sequence_padding(batch_labels) + yield [batch_token_ids, batch_segment_ids], batch_labels + batch_token_ids, batch_segment_ids, batch_labels = [], [], [] + + +""" +后面的代码使用的是bert类型的模型,如果你用的是albert,那么前几行请改为: + +model = build_transformer_model( + config_path, + checkpoint_path, + model='albert', +) + +output_layer = 'Transformer-FeedForward-Norm' +output = model.get_layer(output_layer).get_output_at(bert_layers - 1) +""" + +model = build_transformer_model( + config_path, + checkpoint_path, +) + +output_layer = 'Transformer-%s-FeedForward-Norm' % (bert_layers - 1) +output = model.get_layer(output_layer).output +output = Dense(num_labels)(output) +CRF = ConditionalRandomField(lr_multiplier=crf_lr_multiplier) +output = CRF(output) + +model = Model(model.input, output) +model.summary() + 
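+# 说明:Dense(num_labels)给出每个token在4类标签上的发射分数,CRF层在其上学习
+# 标签间的转移矩阵;训练或加载权重后,需要用K.eval(CRF.trans)把转移矩阵同步给
+# ViterbiDecoder(见下面的WordSegmenter和Evaluator),否则解码用的仍是旧矩阵。
+# 使用示意(假设已训练/加载好权重,示例句子仅作演示):
+#     segmenter.trans = K.eval(CRF.trans)
+#     print(segmenter.tokenize(u'结婚的和尚未结婚的'))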
+model.compile( + loss=CRF.sparse_loss, + optimizer=Adam(learning_rate), + metrics=[CRF.sparse_accuracy] +) + + +class WordSegmenter(ViterbiDecoder): + """基本分词器 + """ + def tokenize(self, text): + tokens = tokenizer.tokenize(text) + while len(tokens) > 512: + tokens.pop(-2) + mapping = tokenizer.rematch(text, tokens) + token_ids = tokenizer.tokens_to_ids(tokens) + segment_ids = [0] * len(token_ids) + token_ids, segment_ids = to_array([token_ids], [segment_ids]) + nodes = model.predict([token_ids, segment_ids])[0] + labels = self.decode(nodes) + words = [] + for i, label in enumerate(labels[1:-1]): + if label < 2 or len(words) == 0: + words.append([i + 1]) + else: + words[-1].append(i + 1) + return [text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1] for w in words] + + +segmenter = WordSegmenter(trans=K.eval(CRF.trans), starts=[0], ends=[0]) + + +def simple_evaluate(data): + """简单的评测 + 该评测指标不等价于官方的评测指标,但基本呈正相关关系, + 可以用来快速筛选模型。 + """ + total, right = 0., 0. + for w_true in tqdm(data): + w_pred = segmenter.tokenize(''.join(w_true)) + w_pred = set(w_pred) + w_true = set(w_true) + total += len(w_true) + right += len(w_true & w_pred) + return right / total + + +def predict_to_file(in_file, out_file): + """预测结果到文件,便于用官方脚本评测 + 使用示例: + predict_to_file('/root/icwb2-data/testing/pku_test.utf8', 'myresult.txt') + 官方评测代码示例: + data_dir="/root/icwb2-data" + $data_dir/scripts/score $data_dir/gold/pku_training_words.utf8 $data_dir/gold/pku_test_gold.utf8 myresult.txt > myscore.txt + (执行完毕后查看myscore.txt的内容末尾) + """ + fw = open(out_file, 'w', encoding='utf-8') + with open(in_file, encoding='utf-8') as fr: + for l in tqdm(fr): + l = l.strip() + if l: + l = ' '.join(segmenter.tokenize(l)) + fw.write(l + '\n') + fw.close() + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.best_val_acc = 0 + + def on_epoch_end(self, epoch, logs=None): + trans = K.eval(CRF.trans) + segmenter.trans = trans + print(segmenter.trans) + acc = simple_evaluate(valid_data) + # 保存最优 + if acc >= self.best_val_acc: + self.best_val_acc = acc + model.save_weights('./best_model.weights') + print('acc: %.5f, best acc: %.5f' % (acc, self.best_val_acc)) + + +if __name__ == '__main__': + + evaluator = Evaluator() + train_generator = data_generator(train_data, batch_size) + + model.fit( + train_generator.forfit(), + steps_per_epoch=len(train_generator), + epochs=epochs, + callbacks=[evaluator] + ) + +else: + + model.load_weights('./best_model.weights') + segmenter.trans = K.eval(CRF.trans) diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_sequence_labeling_ner_crf.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_sequence_labeling_ner_crf.py new file mode 100644 index 0000000000000000000000000000000000000000..570a5360b68836c2cff488ec56bcad4c589d8425 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/examples/task_sequence_labeling_ner_crf.py @@ -0,0 +1,241 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! -*- coding: utf-8 -*- +# 用CRF做中文命名实体识别 +# 数据集 http://s3.bmio.net/kashgari/china-people-daily-ner-corpus.tar.gz +# 实测验证集的F1可以到96.48%,测试集的F1可以到95.38% + +import numpy as np +from bert4keras.backend import keras, K +from bert4keras.models import build_transformer_model +from bert4keras.tokenizers import Tokenizer +from bert4keras.optimizers import Adam +from bert4keras.snippets import sequence_padding, DataGenerator +from bert4keras.snippets import open, ViterbiDecoder, to_array +from bert4keras.layers import ConditionalRandomField +from keras.layers import Dense +from keras.models import Model +from tqdm import tqdm + +maxlen = 256 +epochs = 10 +batch_size = 32 +bert_layers = 12 +learning_rate = 2e-5 # bert_layers越小,学习率应该要越大 +crf_lr_multiplier = 1000 # 必要时扩大CRF层的学习率 +categories = set() + +# bert配置 +config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' +dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' + + +def load_data(filename): + """加载数据 + 单条格式:[text, (start, end, label), (start, end, label), ...], + 意味着text[start:end + 1]是类型为label的实体。 + """ + D = [] + with open(filename, encoding='utf-8') as f: + f = f.read() + for l in f.split('\n\n'): + if not l: + continue + d = [''] + for i, c in enumerate(l.split('\n')): + char, flag = c.split(' ') + d[0] += char + if flag[0] == 'B': + d.append([i, i, flag[2:]]) + categories.add(flag[2:]) + elif flag[0] == 'I': + d[-1][1] = i + D.append(d) + return D + + +# 标注数据 +train_data = load_data('/root/ner/china-people-daily-ner-corpus/example.train') +valid_data = load_data('/root/ner/china-people-daily-ner-corpus/example.dev') +test_data = load_data('/root/ner/china-people-daily-ner-corpus/example.test') +categories = list(sorted(categories)) + +# 建立分词器 +tokenizer = Tokenizer(dict_path, do_lower_case=True) + + +class data_generator(DataGenerator): + """数据生成器 + """ + def __iter__(self, random=False): + batch_token_ids, batch_segment_ids, batch_labels = [], [], [] + for is_end, d in self.sample(random): + tokens = tokenizer.tokenize(d[0], maxlen=maxlen) + mapping = tokenizer.rematch(d[0], tokens) + start_mapping = {j[0]: i for i, j in enumerate(mapping) if j} + end_mapping = {j[-1]: i for i, j in enumerate(mapping) if j} + token_ids = tokenizer.tokens_to_ids(tokens) + segment_ids = [0] * len(token_ids) + labels = np.zeros(len(token_ids)) + for start, end, label in d[1:]: + if start in start_mapping and end in end_mapping: + start = start_mapping[start] + end = end_mapping[end] + labels[start] = categories.index(label) * 2 + 1 + labels[start + 1:end + 1] = categories.index(label) * 2 + 2 + batch_token_ids.append(token_ids) + batch_segment_ids.append(segment_ids) + 
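+            # labels的编码方式:0表示非实体(O);第k类实体(k从0开始)的
+            # 起始token标为2k+1,其余token标为2k+2,与上面
+            # categories.index(label)*2+1 / *2+2 的写法一一对应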
batch_labels.append(labels) + if len(batch_token_ids) == self.batch_size or is_end: + batch_token_ids = sequence_padding(batch_token_ids) + batch_segment_ids = sequence_padding(batch_segment_ids) + batch_labels = sequence_padding(batch_labels) + yield [batch_token_ids, batch_segment_ids], batch_labels + batch_token_ids, batch_segment_ids, batch_labels = [], [], [] + + +""" +后面的代码使用的是bert类型的模型,如果你用的是albert,那么前几行请改为: +model = build_transformer_model( + config_path, + checkpoint_path, + model='albert', +) +output_layer = 'Transformer-FeedForward-Norm' +output = model.get_layer(output_layer).get_output_at(bert_layers - 1) +""" + +model = build_transformer_model( + config_path, + checkpoint_path, +) + +output_layer = 'Transformer-%s-FeedForward-Norm' % (bert_layers - 1) +output = model.get_layer(output_layer).output +output = Dense(len(categories) * 2 + 1)(output) +CRF = ConditionalRandomField(lr_multiplier=crf_lr_multiplier) +output = CRF(output) + +model = Model(model.input, output) +model.summary() + +model.compile( + loss=CRF.sparse_loss, + optimizer=Adam(learning_rate), + metrics=[CRF.sparse_accuracy] +) + + +class NamedEntityRecognizer(ViterbiDecoder): + """命名实体识别器 + """ + def recognize(self, text): + tokens = tokenizer.tokenize(text, maxlen=512) + mapping = tokenizer.rematch(text, tokens) + token_ids = tokenizer.tokens_to_ids(tokens) + segment_ids = [0] * len(token_ids) + token_ids, segment_ids = to_array([token_ids], [segment_ids]) + nodes = model.predict([token_ids, segment_ids])[0] + labels = self.decode(nodes) + entities, starting = [], False + for i, label in enumerate(labels): + if label > 0: + if label % 2 == 1: + starting = True + entities.append([[i], categories[(label - 1) // 2]]) + elif starting: + entities[-1][0].append(i) + else: + starting = False + else: + starting = False + return [(mapping[w[0]][0], mapping[w[-1]][-1], l) for w, l in entities] + + +NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0]) + + +def evaluate(data): + """评测函数 + """ + X, Y, Z = 1e-10, 1e-10, 1e-10 + for d in tqdm(data, ncols=100): + R = set(NER.recognize(d[0])) + T = set([tuple(i) for i in d[1:]]) + X += len(R & T) + Y += len(R) + Z += len(T) + f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z + return f1, precision, recall + + +class Evaluator(keras.callbacks.Callback): + """评估与保存 + """ + def __init__(self): + self.best_val_f1 = 0 + + def on_epoch_end(self, epoch, logs=None): + trans = K.eval(CRF.trans) + NER.trans = trans + print(NER.trans) + f1, precision, recall = evaluate(valid_data) + # 保存最优 + if f1 >= self.best_val_f1: + self.best_val_f1 = f1 + model.save_weights('./best_model.weights') + print( + 'valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' % + (f1, precision, recall, self.best_val_f1) + ) + f1, precision, recall = evaluate(test_data) + print( + 'test: f1: %.5f, precision: %.5f, recall: %.5f\n' % + (f1, precision, recall) + ) + + +if __name__ == '__main__': + + evaluator = Evaluator() + train_generator = data_generator(train_data, batch_size) + + model.fit( + train_generator.forfit(), + steps_per_epoch=len(train_generator), + epochs=epochs, + callbacks=[evaluator] + ) + +else: + + model.load_weights('./best_model.weights') + NER.trans = K.eval(CRF.trans) diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/modelzoo_level.txt b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/modelzoo_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..31529da2e68f25b61e2a3e698a07537281443c03 --- /dev/null +++ 
b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/modelzoo_level.txt @@ -0,0 +1,3 @@ +FuncStatus:OK +PerfStatus:OK +PrecisionStatus:OK \ No newline at end of file diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/pretraining/README.md b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/pretraining/README.md new file mode 100644 index 0000000000000000000000000000000000000000..db756b19a99f58719082f807ea7c07705b3caa9a --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/pretraining/README.md @@ -0,0 +1,19 @@ +# 预训练相关代码 + +目前支持RoBERTa和GPT模式的预训练。请在tensorflow 1.14或1.15下运行。 + +## 使用 +``` +python data_utils.py # 生成tfrecord +python pretraining.py # 启动预训练过程 +``` + +请阅读`data_utils.py`和`pretraining.py`修改相应的配置和参数,以适配自己的语料和设备。 + +## 背景 + +keras是一个友好的框架,通常我们都是基于tf后端使用,另外还有tf.keras可以使用,基本上跟keras 2.3.x的接口一致了。 + +这种一致性意味着使用keras几乎就相当于使用tf,这意味着tf的一切优势keras也有,但tf没有的优势(比如使用简便)keras也有。 + +因此,作者参考原训练过程地实现了基于keras的预训练脚本,而有了这个keras版之后,因为前面所述的一致性,所以我们可以很轻松地迁移到多GPU上训练,也可以很轻松地迁移到TPU上训练。 diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/pretraining/data_utils.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/pretraining/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4a4fe6032e71dae283a69a673bc04b233a8eb5af --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/pretraining/data_utils.py @@ -0,0 +1,437 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding: utf-8 -*- +# 预训练语料构建 + +import os +os.environ['TF_KERAS'] = '1' # 必须使用tf.keras + +import numpy as np +import tensorflow as tf +from bert4keras.snippets import parallel_apply +from bert4keras.backend import K + + +class TrainingDataset(object): + """预训练数据集生成器 + """ + def __init__(self, tokenizer, sequence_length=512): + """参数说明: + tokenizer必须是bert4keras自带的tokenizer类; + """ + self.tokenizer = tokenizer + self.sequence_length = sequence_length + self.token_pad_id = tokenizer._token_pad_id + self.token_cls_id = tokenizer._token_start_id + self.token_sep_id = tokenizer._token_end_id + self.token_mask_id = tokenizer._token_mask_id + self.vocab_size = tokenizer._vocab_size + + def padding(self, sequence, padding_value=None): + """对单个序列进行补0 + """ + if padding_value is None: + padding_value = self.token_pad_id + + sequence = sequence[:self.sequence_length] + padding_length = self.sequence_length - len(sequence) + return sequence + [padding_value] * padding_length + + def sentence_process(self, text): + """单个文本的处理函数,返回处理后的instance + """ + raise NotImplementedError + + def paragraph_process(self, texts, starts, ends, paddings): + """单个段落(多个文本)的处理函数 + 说明:texts是单句组成的list;starts是每个instance的起始id; + ends是每个instance的终止id;paddings是每个instance的填充id。 + 做法:不断塞句子,直到长度最接近sequence_length,然后padding。 + """ + instances, instance = [], [[start] for start in starts] + + for text in texts: + # 处理单个句子 + sub_instance = self.sentence_process(text) + sub_instance = [i[:self.sequence_length - 2] for i in sub_instance] + new_length = len(instance[0]) + len(sub_instance[0]) + + # 如果长度即将溢出 + if new_length > self.sequence_length - 1: + # 插入终止符,并padding + complete_instance = [] + for item, end, pad in zip(instance, ends, paddings): + item.append(end) + item = self.padding(item, pad) + complete_instance.append(item) + # 存储结果,并构建新样本 + instances.append(complete_instance) + instance = [[start] for start in starts] + + # 样本续接 + for item, sub_item in zip(instance, sub_instance): + item.extend(sub_item) + + # 插入终止符,并padding + complete_instance = [] + for item, end, pad in zip(instance, ends, paddings): + item.append(end) + item = self.padding(item, pad) + complete_instance.append(item) + + # 存储最后的instance + instances.append(complete_instance) + + return instances + + def tfrecord_serialize(self, instances, instance_keys): + """转为tfrecord的字符串,等待写入到文件 + """ + def create_feature(x): + return tf.train.Feature(int64_list=tf.train.Int64List(value=x)) + + serialized_instances = [] + for instance in instances: + features = { + k: create_feature(v) + for k, v in zip(instance_keys, instance) + } + tf_features = tf.train.Features(feature=features) + tf_example = tf.train.Example(features=tf_features) + serialized_instance = tf_example.SerializeToString() + serialized_instances.append(serialized_instance) + + return serialized_instances + + def process(self, corpus, record_name, workers=8, max_queue_size=2000): + """处理输入语料(corpus),最终转为tfrecord格式(record_name) + 自带多进程支持,如果cpu核心数多,请加大workers和max_queue_size。 + """ + writer = tf.io.TFRecordWriter(record_name) + globals()['count'] = 0 + + def write_to_tfrecord(serialized_instances): + globals()['count'] += len(serialized_instances) + for serialized_instance in serialized_instances: + writer.write(serialized_instance) + + def paragraph_process(texts): + instances = self.paragraph_process(texts) + serialized_instances = self.tfrecord_serialize(instances) + return serialized_instances + + parallel_apply( + func=paragraph_process, + iterable=corpus, + workers=workers, + 
max_queue_size=max_queue_size, + callback=write_to_tfrecord, + ) + + writer.close() + print('write %s examples into %s' % (count, record_name)) + + @staticmethod + def load_tfrecord(record_names, batch_size, parse_function): + """加载处理成tfrecord格式的语料 + """ + if not isinstance(record_names, list): + record_names = [record_names] + + dataset = tf.data.TFRecordDataset(record_names) # 加载 + dataset = dataset.map(parse_function) # 解析 + dataset = dataset.repeat() # 循环 + dataset = dataset.shuffle(batch_size * 1000) # 打乱 + dataset = dataset.batch(batch_size) # 成批 + + return dataset + + +class TrainingDatasetRoBERTa(TrainingDataset): + """预训练数据集生成器(RoBERTa模式) + """ + def __init__( + self, tokenizer, word_segment, mask_rate=0.15, sequence_length=512 + ): + """参数说明: + tokenizer必须是bert4keras自带的tokenizer类; + word_segment是任意分词函数。 + """ + super(TrainingDatasetRoBERTa, self).__init__(tokenizer, sequence_length) + self.word_segment = word_segment + self.mask_rate = mask_rate + + def token_process(self, token_id): + """以80%的几率替换为[MASK],以10%的几率保持不变, + 以10%的几率替换为一个随机token。 + """ + rand = np.random.random() + if rand <= 0.8: + return self.token_mask_id + elif rand <= 0.9: + return token_id + else: + return np.random.randint(0, self.vocab_size) + + def sentence_process(self, text): + """单个文本的处理函数 + 流程:分词,然后转id,按照mask_rate构建全词mask的序列 + 来指定哪些token是否要被mask + """ + words = self.word_segment(text) + rands = np.random.random(len(words)) + + token_ids, mask_ids = [], [] + for rand, word in zip(rands, words): + word_tokens = self.tokenizer.tokenize(text=word)[1:-1] + word_token_ids = self.tokenizer.tokens_to_ids(word_tokens) + token_ids.extend(word_token_ids) + + if rand < self.mask_rate: + word_mask_ids = [ + self.token_process(i) + 1 for i in word_token_ids + ] + else: + word_mask_ids = [0] * len(word_tokens) + + mask_ids.extend(word_mask_ids) + + return [token_ids, mask_ids] + + def paragraph_process(self, texts): + """给原方法补上starts、ends、paddings + """ + starts = [self.token_cls_id, 0] + ends = [self.token_sep_id, 0] + paddings = [self.token_pad_id, 0] + return super(TrainingDatasetRoBERTa, + self).paragraph_process(texts, starts, ends, paddings) + + def tfrecord_serialize(self, instances): + """给原方法补上instance_keys + """ + instance_keys = ['token_ids', 'mask_ids'] + return super(TrainingDatasetRoBERTa, + self).tfrecord_serialize(instances, instance_keys) + + @staticmethod + def load_tfrecord(record_names, sequence_length, batch_size): + """给原方法补上parse_function + """ + def parse_function(serialized): + features = { + 'token_ids': tf.io.FixedLenFeature([sequence_length], tf.int64), + 'mask_ids': tf.io.FixedLenFeature([sequence_length], tf.int64), + } + features = tf.io.parse_single_example(serialized, features) + token_ids = features['token_ids'] + mask_ids = features['mask_ids'] + segment_ids = K.zeros_like(token_ids, dtype='int64') + is_masked = K.not_equal(mask_ids, 0) + masked_token_ids = K.switch(is_masked, mask_ids - 1, token_ids) + x = { + 'Input-Token': masked_token_ids, + 'Input-Segment': segment_ids, + 'token_ids': token_ids, + 'is_masked': K.cast(is_masked, K.floatx()), + } + y = { + 'mlm_loss': K.zeros([1]), + 'mlm_acc': K.zeros([1]), + } + return x, y + + return TrainingDataset.load_tfrecord( + record_names, batch_size, parse_function + ) + + +class TrainingDatasetGPT(TrainingDataset): + """预训练数据集生成器(GPT模式,单向语言模型) + """ + def sentence_process(self, text): + """单个文本的处理函数 + 流程:分词,然后转id。 + """ + tokens = self.tokenizer.tokenize(text=text)[1:-1] + token_ids = self.tokenizer.tokens_to_ids(tokens) + return 
[token_ids] + + def paragraph_process(self, texts): + """给原方法补上starts、ends、paddings + """ + starts = [self.token_cls_id] + ends = [self.token_sep_id] + paddings = [self.token_pad_id] + return super(TrainingDatasetGPT, + self).paragraph_process(texts, starts, ends, paddings) + + def tfrecord_serialize(self, instances): + """给原方法补上instance_keys + """ + instance_keys = ['token_ids'] + return super(TrainingDatasetGPT, + self).tfrecord_serialize(instances, instance_keys) + + @staticmethod + def load_tfrecord(record_names, sequence_length, batch_size): + """给原方法补上parse_function + """ + def parse_function(serialized): + features = { + 'token_ids': tf.io.FixedLenFeature([sequence_length], tf.int64), + } + features = tf.io.parse_single_example(serialized, features) + token_ids = features['token_ids'] + segment_ids = K.zeros_like(token_ids, dtype='int64') + x = { + 'Input-Token': token_ids, + 'Input-Segment': segment_ids, + } + y = { + 'lm_loss': K.zeros([1]), + 'lm_acc': K.zeros([1]), + } + return x, y + + return TrainingDataset.load_tfrecord( + record_names, batch_size, parse_function + ) + + +class TrainingDatasetUniLM(TrainingDatasetGPT): + """预训练数据集生成器(UniLM模式,Seq2Seq模型) + """ + @staticmethod + def load_tfrecord(record_names, sequence_length, batch_size, token_sep_id): + """给原方法补上parse_function + """ + def parse_function(serialized): + features = { + 'token_ids': tf.io.FixedLenFeature([sequence_length], tf.int64), + } + features = tf.io.parse_single_example(serialized, features) + token_ids = features['token_ids'] + segment = K.random_uniform( + shape=[1], minval=1, maxval=sequence_length - 1, dtype='int64' + )[0] + segment_ids = K.one_hot(segment + 1, sequence_length) + segment_ids = K.cast(K.cumsum(segment_ids), 'int64') + token_ids_1 = token_ids[:segment] + token_ids_2 = K.zeros([1], dtype='int64') + token_sep_id + token_ids_3 = token_ids[segment:-1] + token_ids = K.concatenate([token_ids_1, token_ids_2, token_ids_3]) + x = { + 'Input-Token': token_ids, + 'Input-Segment': segment_ids, + } + y = { + 'unilm_loss': K.zeros([1]), + 'unilm_acc': K.zeros([1]), + } + return x, y + + return TrainingDataset.load_tfrecord( + record_names, batch_size, parse_function + ) + + +if __name__ == '__main__': + + from bert4keras.tokenizers import Tokenizer + import json, glob, re + from tqdm import tqdm + + model = 'roberta' + sequence_length = 512 + workers = 40 + max_queue_size = 4000 + dict_path = '/home/spaces_ac_cn/chinese_L-12_H-768_A-12/vocab.txt' + tokenizer = Tokenizer(dict_path, do_lower_case=True) + + def some_texts(): + filenames = glob.glob('/home/spaces_ac_cn/corpus/*/*/*') + np.random.shuffle(filenames) + count, texts = 0, [] + for filename in filenames: + with open(filename) as f: + for l in f: + l = json.loads(l)['text'].strip() + texts.extend(re.findall(u'.*?[\n。]+', l)) + count += 1 + if count == 10: # 10篇文章合在一起再处理 + yield texts + count, texts = 0, [] + if texts: + yield texts + + assert model in ['roberta', 'gpt', 'unilm'] # 判断是否支持的模型类型 + + if model == 'roberta': + + import jieba_fast as jieba + jieba.initialize() + + def word_segment(text): + return jieba.lcut(text) + + TD = TrainingDatasetRoBERTa( + tokenizer, word_segment, sequence_length=sequence_length + ) + + for i in range(10): # 数据重复10遍 + TD.process( + corpus=tqdm(some_texts()), + record_name='../corpus_tfrecord/corpus.%s.tfrecord' % i, + workers=workers, + max_queue_size=max_queue_size, + ) + + elif model == 'gpt': + + TD = TrainingDatasetGPT(tokenizer, sequence_length=sequence_length) + + TD.process( + corpus=tqdm(some_texts()), + 
record_name='../corpus_tfrecord/corpus.tfrecord', + workers=workers, + max_queue_size=max_queue_size, + ) + + elif model == 'unilm': + + TD = TrainingDatasetUniLM(tokenizer, sequence_length=sequence_length) + + TD.process( + corpus=tqdm(some_texts()), + record_name='../corpus_tfrecord/corpus.tfrecord', + workers=workers, + max_queue_size=max_queue_size, + ) diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/pretraining/pretraining.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/pretraining/pretraining.py new file mode 100644 index 0000000000000000000000000000000000000000..fb27051b3014026750f6f08bdf5aec8258a406f0 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/pretraining/pretraining.py @@ -0,0 +1,358 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +#! 
-*- coding: utf-8 -*- +# 预训练脚本,多GPU版/TPU版本 + +import os, re +os.environ['TF_KERAS'] = '1' # 必须使用tf.keras + +import tensorflow as tf +from data_utils import * +from bert4keras.models import build_transformer_model +from bert4keras.backend import keras, K +from bert4keras.optimizers import Adam +from bert4keras.optimizers import extend_with_weight_decay +from bert4keras.optimizers import extend_with_layer_adaptation +from bert4keras.optimizers import extend_with_piecewise_linear_lr +from bert4keras.optimizers import extend_with_gradient_accumulation +from keras.layers import Input, Lambda +from keras.models import Model +from keras.callbacks import Callback, CSVLogger + +model = 'roberta' + +# 语料路径和模型保存路径 +# 如果是TPU训练,那么语料必须存放在Google Cloud Storage上面, +# 路径必须以gs://开头;如果是GPU训练,改为普通路径即可。 +model_saved_path = 'gs://xxxx/bert4keras/saved_model/bert_model.ckpt' +corpus_paths = [ + 'gs://xxxx/bert4keras/corpus/corpus.%s.tfrecord' % i for i in range(10) +] + +# 其他配置 +sequence_length = 512 +batch_size = 4096 +config_path = '/home/spaces_ac_cn/chinese_L-12_H-768_A-12/bert_config.json' +checkpoint_path = '/home/spaces_ac_cn/chinese_L-12_H-768_A-12/bert_model.ckpt' # 如果从零训练,就设为None +learning_rate = 0.00176 +weight_decay_rate = 0.01 +num_warmup_steps = 3125 +num_train_steps = 125000 +steps_per_epoch = 10000 +grad_accum_steps = 16 # 大于1即表明使用梯度累积 +epochs = num_train_steps * grad_accum_steps // steps_per_epoch +exclude_from_weight_decay = ['Norm', 'bias'] +exclude_from_layer_adaptation = ['Norm', 'bias'] +tpu_address = 'grpc://xxx.xxx.xxx.xxx:8470' # 如果用多GPU跑,直接设为None +which_optimizer = 'lamb' # adam 或 lamb,均自带weight decay +lr_schedule = { + num_warmup_steps * grad_accum_steps: 1.0, + num_train_steps * grad_accum_steps: 0.0, +} +floatx = K.floatx() + +# 读取数据集,构建数据张量 + +if model == 'roberta': + + dataset = TrainingDatasetRoBERTa.load_tfrecord( + record_names=corpus_paths, + sequence_length=sequence_length, + batch_size=batch_size // grad_accum_steps, + ) + +elif model == 'gpt': + + dataset = TrainingDatasetGPT.load_tfrecord( + record_names=corpus_paths, + sequence_length=sequence_length, + batch_size=batch_size // grad_accum_steps, + ) + +elif model == 'unilm': + + dataset = TrainingDatasetUniLM.load_tfrecord( + record_names=corpus_paths, + sequence_length=sequence_length, + batch_size=batch_size // grad_accum_steps, + token_sep_id=3, # 这里需要自己指定[SEP]的id + ) + + +def build_transformer_model_with_mlm(): + """带mlm的bert模型 + """ + bert = build_transformer_model( + config_path, with_mlm='linear', return_keras_model=False + ) + proba = bert.model.output + + # 辅助输入 + token_ids = Input(shape=(None,), dtype='int64', name='token_ids') # 目标id + is_masked = Input(shape=(None,), dtype=floatx, name='is_masked') # mask标记 + + def mlm_loss(inputs): + """计算loss的函数,需要封装为一个层 + """ + y_true, y_pred, mask = inputs + loss = K.sparse_categorical_crossentropy( + y_true, y_pred, from_logits=True + ) + loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon()) + return loss + + def mlm_acc(inputs): + """计算准确率的函数,需要封装为一个层 + """ + y_true, y_pred, mask = inputs + y_true = K.cast(y_true, floatx) + acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred) + acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon()) + return acc + + mlm_loss = Lambda(mlm_loss, name='mlm_loss')([token_ids, proba, is_masked]) + mlm_acc = Lambda(mlm_acc, name='mlm_acc')([token_ids, proba, is_masked]) + + train_model = Model( + bert.model.inputs + [token_ids, is_masked], [mlm_loss, mlm_acc] + ) + + loss = { + 'mlm_loss': lambda y_true, y_pred: y_pred, + 'mlm_acc': 
lambda y_true, y_pred: K.stop_gradient(y_pred), + } + + return bert, train_model, loss + + +def build_transformer_model_with_lm(): + """带lm的bert模型 + """ + bert = build_transformer_model( + config_path, + with_mlm='linear', + application='lm', + return_keras_model=False + ) + token_ids = bert.model.inputs[0] + proba = bert.model.output + + def lm_loss(inputs, mask=None): + """计算loss的函数,需要封装为一个层 + """ + y_true, y_pred = inputs + y_true, y_pred = y_true[:, 1:], y_pred[:, :-1] + + if mask is None: + mask = 1.0 + else: + mask = K.cast(mask[1][:, 1:], floatx) + + loss = K.sparse_categorical_crossentropy( + y_true, y_pred, from_logits=True + ) + loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon()) + return loss + + def lm_acc(inputs, mask=None): + """计算准确率的函数,需要封装为一个层 + """ + y_true, y_pred = inputs + y_true, y_pred = K.cast(y_true[:, 1:], floatx), y_pred[:, :-1] + + if mask is None: + mask = 1.0 + else: + mask = K.cast(mask[1][:, 1:], floatx) + + acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred) + acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon()) + return acc + + lm_loss = Lambda(lm_loss, name='lm_loss')([token_ids, proba]) + lm_acc = Lambda(lm_acc, name='lm_acc')([token_ids, proba]) + + train_model = Model(bert.model.inputs, [lm_loss, lm_acc]) + + loss = { + 'lm_loss': lambda y_true, y_pred: y_pred, + 'lm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred), + } + + return bert, train_model, loss + + +def build_transformer_model_with_unilm(): + """带unilm的bert模型 + """ + bert = build_transformer_model( + config_path, + with_mlm='linear', + application='unilm', + return_keras_model=False + ) + token_ids = bert.model.inputs[0] + segment_ids = bert.model.inputs[1] + proba = bert.model.output + + def unilm_loss(inputs, mask=None): + """计算loss的函数,需要封装为一个层 + """ + y_true, y_pred, segment_ids = inputs + y_true, y_pred = y_true[:, 1:], y_pred[:, :-1] + + if mask is None: + mask = 1.0 + else: + mask = K.cast(mask[1][:, 1:], floatx) + + segment_ids = K.cast(segment_ids, floatx) + mask = mask * segment_ids[:, 1:] + + loss = K.sparse_categorical_crossentropy( + y_true, y_pred, from_logits=True + ) + loss = K.sum(loss * mask) / (K.sum(mask) + K.epsilon()) + return loss + + def unilm_acc(inputs, mask=None): + """计算准确率的函数,需要封装为一个层 + """ + y_true, y_pred, segment_ids = inputs + y_true, y_pred = K.cast(y_true[:, 1:], floatx), y_pred[:, :-1] + + if mask is None: + mask = 1.0 + else: + mask = K.cast(mask[1][:, 1:], floatx) + + segment_ids = K.cast(segment_ids, floatx) + mask = mask * segment_ids[:, 1:] + + acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred) + acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon()) + return acc + + token_proba_segment = [token_ids, proba, segment_ids] + unilm_loss = Lambda(unilm_loss, name='unilm_loss')(token_proba_segment) + unilm_acc = Lambda(unilm_acc, name='unilm_acc')(token_proba_segment) + + train_model = Model(bert.model.inputs, [unilm_loss, unilm_acc]) + + loss = { + 'unilm_loss': lambda y_true, y_pred: y_pred, + 'unilm_acc': lambda y_true, y_pred: K.stop_gradient(y_pred), + } + + return bert, train_model, loss + + +def build_transformer_model_for_pretraining(): + """构建训练模型,通用于TPU/GPU + 注意全程要用keras标准的层写法,一些比较灵活的“移花接木”式的 + 写法可能会在TPU上训练失败。此外,要注意的是TPU并非支持所有 + tensorflow算子,尤其不支持动态(变长)算子,因此编写相应运算 + 时要格外留意。 + """ + if model == 'roberta': + bert, train_model, loss = build_transformer_model_with_mlm() + elif model == 'gpt': + bert, train_model, loss = build_transformer_model_with_lm() + elif model == 'unilm': + bert, train_model, loss = 
build_transformer_model_with_unilm() + + # 优化器 + optimizer = extend_with_weight_decay(Adam) + if which_optimizer == 'lamb': + optimizer = extend_with_layer_adaptation(optimizer) + optimizer = extend_with_piecewise_linear_lr(optimizer) + optimizer_params = { + 'learning_rate': learning_rate, + 'lr_schedule': lr_schedule, + 'weight_decay_rate': weight_decay_rate, + 'exclude_from_weight_decay': exclude_from_weight_decay, + 'exclude_from_layer_adaptation': exclude_from_layer_adaptation, + 'bias_correction': False, + } + if grad_accum_steps > 1: + optimizer = extend_with_gradient_accumulation(optimizer) + optimizer_params['grad_accum_steps'] = grad_accum_steps + optimizer = optimizer(**optimizer_params) + + # 模型定型 + train_model.compile(loss=loss, optimizer=optimizer) + + # 如果传入权重,则加载。注:须在此处加载,才保证不报错。 + if checkpoint_path is not None: + bert.load_weights_from_checkpoint(checkpoint_path) + + return train_model + + +if tpu_address is None: + # 单机多卡模式(多机多卡也类似,但需要硬软件配合,请参考https://tf.wiki) + strategy = tf.distribute.MirroredStrategy() +else: + # TPU模式 + resolver = tf.distribute.cluster_resolver.TPUClusterResolver( + tpu=tpu_address + ) + tf.config.experimental_connect_to_host(resolver.master()) + tf.tpu.experimental.initialize_tpu_system(resolver) + strategy = tf.distribute.experimental.TPUStrategy(resolver) + +with strategy.scope(): + train_model = build_transformer_model_for_pretraining() + train_model.summary() + + +class ModelCheckpoint(keras.callbacks.Callback): + """自动保存最新模型 + """ + def on_epoch_end(self, epoch, logs=None): + # model.save_weights 保存的模型,用 model.load_weights 加载 + # bert.save_weights_as_checkpoint 保存的模型,用 bert.load_weights_from_checkpoint 加载 + # 不要问为什么保存的模型用 build_transformer_model 加载不了 + # 先搞清楚对应情况,build_transformer_model 是用 load_weights_from_checkpoint 加载的。 + self.model.save_weights(model_saved_path, overwrite=True) + + +# 保存模型 +checkpoint = ModelCheckpoint() +# 记录日志 +csv_logger = keras.callbacks.CSVLogger('training.log') + +# 模型训练 +train_model.fit( + dataset, + steps_per_epoch=steps_per_epoch, + epochs=epochs, + callbacks=[checkpoint, csv_logger], +) diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/setup.py b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..622042f107cdab03b8f93254044b516f1fc59025 --- /dev/null +++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/setup.py @@ -0,0 +1,45 @@ +# +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#! -*- coding: utf-8 -*-
+
+from setuptools import setup, find_packages
+
+setup(
+    name='bert4keras',
+    version='0.11.3',
+    description='an elegant bert4keras',
+    long_description='bert4keras: https://github.com/bojone/bert4keras',
+    license='Apache License 2.0',
+    url='https://github.com/bojone/bert4keras',
+    author='bojone',
+    author_email='bojone@spaces.ac.cn',
+    install_requires=['keras<=2.3.1'],
+    packages=find_packages()
+)
diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/test/.keep b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/test/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/test/train_full_1p.sh b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/test/train_full_1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f964e506b8409efc314322d995b7a542ae014076
--- /dev/null
+++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/test/train_full_1p.sh
@@ -0,0 +1,146 @@
+#!/bin/bash
+#set -x
+#export TF_KERAS=1
+cur_path=`pwd`
+#export ASCEND_SLOG_PRINT_TO_STDOUT=0
+#export ASCEND_SLOG_PRINT_TO_STDOUT=1
+################基础配置参数,需要模型审视修改##################
+# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE
+# 网络名称,同目录名称
+Network="Bertbase_ID3444_for_TensorFlow"
+# 训练batch_size
+batch_size=64
+# 训练使用的npu卡数
+RANK_SIZE=1
+# 数据集路径,保持为空,不需要修改
+data_path=""
+# 训练epoch
+train_epochs=1
+#训练步数
+#train_steps=100
+# 图片大小
+#image_size=640
+# 指定训练所使用的npu device卡id
+export NPU_CALCULATE_DEVICE=$ASCEND_DEVICE_ID
+export PYTHONPATH=${cur_path}/../:$PYTHONPATH
+
+# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+# 校验是否传入data_path,不需要修改
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+
+
+
+#################创建日志输出目录,不需要修改#################
+#创建DeviceID输出目录,不需要修改
+if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
+    rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID}
+    mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+else
+    mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+fi
+
+for para in $*
+do
+    if [[ $para == --ckpt_path* ]];then
+        ckpt_path=`echo ${para#*=}`
+    fi
+done
+
+
+#################启动训练脚本#################
+# 训练开始时间,不需要修改
+cd $cur_path/../
+sed -i "s|chinese_L-12_H-768_A-12/bert_config.json|${ckpt_path}/bert_config.json|g" examples/task_sentence_similarity_lcqmc.py
+sed -i "s|chinese_L-12_H-768_A-12/bert_model.ckpt|${ckpt_path}/bert_model.ckpt|g" examples/task_sentence_similarity_lcqmc.py
+sed -i "s|chinese_L-12_H-768_A-12/vocab.txt|${ckpt_path}/vocab.txt|g" examples/task_sentence_similarity_lcqmc.py
+
+sed -i "s|datasets/lcqmc/lcqmc.train.data|${data_path}/lcqmc/lcqmc.train.data|g" examples/task_sentence_similarity_lcqmc.py
+sed -i "s|datasets/lcqmc/lcqmc.valid.data|${data_path}/lcqmc/lcqmc.valid.data|g" examples/task_sentence_similarity_lcqmc.py
+sed -i "s|datasets/lcqmc/lcqmc.test.data|${data_path}/lcqmc/lcqmc.test.data|g" 
examples/task_sentence_similarity_lcqmc.py + +#sed -i "s|chinese_L-12_H-768_A-12/bert_config.json|${ckpt_path}/bert_config.json|g" examples/task_sentence_similarity_lcqmc.py +#sed -i "s|datasets/lcqmc/lcqmc.train.data|${data_path}/datasets/lcqmc/lcqmc.train.data|g" examples/task_sentence_similarity_lcqmc.py +#sed -i "s|max_epochs=273|max_epochs=${train_epochs}|g" examples/task_sentence_similarity_lcqmc.py + + +wait +start_time=$(date +%s) + + +nohup python3 ./examples/task_sentence_similarity_lcqmc.py > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait + +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#参数复原 +sed -i "s|${ckpt_path}/bert_config.json|chinese_L-12_H-768_A-12/bert_config.json|g" examples/task_sentence_similarity_lcqmc.py +sed -i "s|${ckpt_path}/bert_model.ckpt|chinese_L-12_H-768_A-12/bert_model.ckpt|g" examples/task_sentence_similarity_lcqmc.py +sed -i "s|${ckpt_path}/vocab.txt|chinese_L-12_H-768_A-12/vocab.txt|g" examples/task_sentence_similarity_lcqmc.py + +sed -i "s|${data_path}/lcqmc/lcqmc.train.data|datasets/lcqmc/lcqmc.train.data|g" examples/task_sentence_similarity_lcqmc.py +sed -i "s|${data_path}/lcqmc/lcqmc.valid.data|datasets/lcqmc/lcqmc.valid.data|g" examples/task_sentence_similarity_lcqmc.py +sed -i "s|${data_path}/lcqmc/lcqmc.test.data|datasets/lcqmc/lcqmc.test.data|g" examples/task_sentence_similarity_lcqmc.py +wait + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +TrainingTime=`grep "loss" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log |awk -F "ms/step" '{print $1}'|awk -F "s" '{print $2}'|tail -n+3|awk '{sum+=$1}END {print"",sum/NR}'` + +FPS=`python3 -c "print(${batch_size}/${TrainingTime}*1000)"` + + +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep "final test acc:" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F " " '{print $4}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + + +###性能看护汇总 +BatchSize=${batch_size} +#设备类型,自动获取 +DeviceType=`uname -m` +#用例名称,自动获取 +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'acc' + +#获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep "loss" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log |awk -F "loss" '{print $2}'|awk -F " " '{print $2}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> 
$cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log
+
+#生成csv
+
diff --git a/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/test/train_performance_1p.sh b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/test/train_performance_1p.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e1c371bb9fc3f30e57266ef03723197dd17e9972
--- /dev/null
+++ b/TensorFlow/built-in/nlp/Bertbase_ID3444_for_TensorFlow/test/train_performance_1p.sh
@@ -0,0 +1,146 @@
+#!/bin/bash
+#set -x
+#export TF_KERAS=1
+cur_path=`pwd`
+#export ASCEND_SLOG_PRINT_TO_STDOUT=0
+#export ASCEND_SLOG_PRINT_TO_STDOUT=1
+################基础配置参数,需要模型审视修改##################
+# 必选字段(必须在此处定义的参数): Network batch_size RANK_SIZE
+# 网络名称,同目录名称
+Network="Bertbase_ID3444_for_TensorFlow"
+# 训练batch_size
+batch_size=64
+# 训练使用的npu卡数
+RANK_SIZE=1
+# 数据集路径,保持为空,不需要修改
+data_path=""
+# 训练epoch
+train_epochs=1
+#训练步数
+#train_steps=100
+# 图片大小
+#image_size=640
+# 指定训练所使用的npu device卡id
+export NPU_CALCULATE_DEVICE=$ASCEND_DEVICE_ID
+export PYTHONPATH=${cur_path}/../:$PYTHONPATH
+
+# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值
+for para in $*
+do
+    if [[ $para == --data_path* ]];then
+        data_path=`echo ${para#*=}`
+    fi
+done
+
+# 校验是否传入data_path,不需要修改
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be config"
+    exit 1
+fi
+
+
+
+#################创建日志输出目录,不需要修改#################
+#创建DeviceID输出目录,不需要修改
+if [ -d ${cur_path}/output/${ASCEND_DEVICE_ID} ];then
+    rm -rf ${cur_path}/output/${ASCEND_DEVICE_ID}
+    mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+else
+    mkdir -p ${cur_path}/output/$ASCEND_DEVICE_ID/ckpt
+fi
+
+for para in $*
+do
+    if [[ $para == --ckpt_path* ]];then
+        ckpt_path=`echo ${para#*=}`
+    fi
+done
+
+
+#################启动训练脚本#################
+# 训练开始时间,不需要修改
+cd $cur_path/../
+sed -i "s|chinese_L-12_H-768_A-12/bert_config.json|${ckpt_path}/bert_config.json|g" examples/task_sentence_similarity_lcqmc.py
+sed -i "s|chinese_L-12_H-768_A-12/bert_model.ckpt|${ckpt_path}/bert_model.ckpt|g" examples/task_sentence_similarity_lcqmc.py
+sed -i "s|chinese_L-12_H-768_A-12/vocab.txt|${ckpt_path}/vocab.txt|g" examples/task_sentence_similarity_lcqmc.py
+
+sed -i "s|datasets/lcqmc/lcqmc.train.data|${data_path}/lcqmc/lcqmc.train.data|g" examples/task_sentence_similarity_lcqmc.py
+sed -i "s|datasets/lcqmc/lcqmc.valid.data|${data_path}/lcqmc/lcqmc.valid.data|g" examples/task_sentence_similarity_lcqmc.py
+sed -i "s|datasets/lcqmc/lcqmc.test.data|${data_path}/lcqmc/lcqmc.test.data|g" examples/task_sentence_similarity_lcqmc.py
+
+#sed -i "s|chinese_L-12_H-768_A-12/bert_config.json|${ckpt_path}/bert_config.json|g" examples/task_sentence_similarity_lcqmc.py
+#sed -i "s|datasets/lcqmc/lcqmc.train.data|${data_path}/datasets/lcqmc/lcqmc.train.data|g" examples/task_sentence_similarity_lcqmc.py
+#sed -i "s|max_epochs=273|max_epochs=${train_epochs}|g" examples/task_sentence_similarity_lcqmc.py
+
+
+wait
+start_time=$(date +%s)
+
+
+nohup python3 ./examples/task_sentence_similarity_lcqmc.py > ${cur_path}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+
+wait
+
+#训练结束时间,不需要修改
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+#参数复原
+sed -i "s|${ckpt_path}/bert_config.json|chinese_L-12_H-768_A-12/bert_config.json|g" examples/task_sentence_similarity_lcqmc.py
+sed -i "s|${ckpt_path}/bert_model.ckpt|chinese_L-12_H-768_A-12/bert_model.ckpt|g" examples/task_sentence_similarity_lcqmc.py
+sed -i "s|${ckpt_path}/vocab.txt|chinese_L-12_H-768_A-12/vocab.txt|g" 
examples/task_sentence_similarity_lcqmc.py + +sed -i "s|${data_path}/lcqmc/lcqmc.train.data|datasets/lcqmc/lcqmc.train.data|g" examples/task_sentence_similarity_lcqmc.py +sed -i "s|${data_path}/lcqmc/lcqmc.valid.data|datasets/lcqmc/lcqmc.valid.data|g" examples/task_sentence_similarity_lcqmc.py +sed -i "s|${data_path}/lcqmc/lcqmc.test.data|datasets/lcqmc/lcqmc.test.data|g" examples/task_sentence_similarity_lcqmc.py +wait + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +TrainingTime=`grep "loss" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log |awk -F "ms/step" '{print $1}'|awk -F "s" '{print $2}'|tail -n+3|awk '{sum+=$1}END {print"",sum/NR}'` + +FPS=`python3 -c "print(${batch_size}/${TrainingTime}*1000)"` + + +#打印,不需要修改 +echo "Final Performance images/sec : $FPS" + +#输出训练精度,需要模型审视修改 +train_accuracy=`grep "final test acc:" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log|awk -F " " '{print $4}'` +#打印,不需要修改 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + + +###性能看护汇总 +BatchSize=${batch_size} +#设备类型,自动获取 +DeviceType=`uname -m` +#用例名称,自动获取 +CaseName=${Network}_bs${BatchSize}_${RANK_SIZE}'p'_'perf' + +#获取性能数据,不需要修改 +#吞吐量 +ActualFPS=${FPS} + +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep "loss" $cur_path/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log |awk -F "loss" '{print $2}'|awk -F " " '{print $2}' > $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +#最后一个迭代loss值,不需要修改 +ActualLoss=`awk 'END {print}' $cur_path/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` + +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "BatchSize = ${BatchSize}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualFPS = ${ActualFPS}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}" >> $cur_path/output/$ASCEND_DEVICE_ID/${CaseName}.log + +#生成csv +
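Usage note (not part of the patch): both test scripts read `--data_path` and `--ckpt_path` from their arguments, rely on `ASCEND_DEVICE_ID` to name the output directory, and expect `${data_path}/lcqmc/lcqmc.{train,valid,test}.data` plus `${ckpt_path}/bert_config.json`, `bert_model.ckpt` and `vocab.txt`. Since each script takes `pwd` as its working directory and then changes to `..` for `examples/`, they are meant to be launched from inside `test/`. A minimal invocation sketch, with `/data` and `/ckpt/chinese_L-12_H-768_A-12` as hypothetical local paths, would be:

    # run from the repo's test/ directory; paths below are placeholders
    export ASCEND_DEVICE_ID=0
    cd test
    bash train_performance_1p.sh --data_path=/data --ckpt_path=/ckpt/chinese_L-12_H-768_A-12
    bash train_full_1p.sh --data_path=/data --ckpt_path=/ckpt/chinese_L-12_H-768_A-12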