From 0d6cb73036fb404c85e4a545ab3876bbccd65441 Mon Sep 17 00:00:00 2001
From: wuxiankun <wuxiankun@huawei.com>
Date: Wed, 5 Jul 2023 13:32:50 +0800
Subject: [PATCH 1/2] add npu adapt code

---
 .../built-in/nlp/Deltalm_for_PyTorch/LICENSE  |  30 ++
 .../nlp/Deltalm_for_PyTorch/README.md         | 342 +++++++--------
 .../nlp/Deltalm_for_PyTorch/README_RAW.md     | 184 ++++++++
 .../cuda/ngram_repeat_block_cuda_kernel.cu    |  82 ----
 .../fairseq/clib/libnat_cuda/edit_dist.cu     | 344 ---------------
 .../fairseq/fairseq/modules/cuda_utils.cu     | 202 ---------
 .../dynamicconv_cuda_kernel.cu                | 176 --------
 .../lightconv_layer/lightconv_cuda_kernel.cu  | 400 ------------------
 .../fairseq/optim/dynamic_loss_scaler.py      |  15 +-
 .../fairseq/fairseq/sequence_generator.py     |   2 +-
 .../fairseq/fairseq/tasks/fairseq_task.py     |   1 +
 .../fairseq/fairseq/trainer.py                |   7 +-
 .../fairseq/fairseq/utils.py                  |   3 -
 .../fairseq/fairseq_cli/train.py              |   3 +-
 .../nlp/Deltalm_for_PyTorch/requirements.txt  |   3 +
 .../nlp/Deltalm_for_PyTorch/test/env_npu.sh   |  31 ++
 .../Deltalm_for_PyTorch/test/train_full_8p.sh | 136 ++++++
 .../test/train_performance_1p.sh              | 136 ++++++
 .../test/train_performance_8p.sh              | 132 ++++++
 .../built-in/nlp/Deltalm_for_PyTorch/train.py |  37 +-
 20 files changed, 864 insertions(+), 1402 deletions(-)
 create mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/LICENSE
 create mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/README_RAW.md
 delete mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/clib/cuda/ngram_repeat_block_cuda_kernel.cu
 delete mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/clib/libnat_cuda/edit_dist.cu
 delete mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/cuda_utils.cu
 delete mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu
 delete mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu
 create mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/requirements.txt
 create mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/env_npu.sh
 create mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_full_8p.sh
 create mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_performance_1p.sh
 create mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_performance_8p.sh

diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/LICENSE b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/LICENSE
new file mode 100644
index 0000000000..92a2f682ce
--- /dev/null
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/LICENSE
@@ -0,0 +1,30 @@
+BSD 3-Clause License
+
+Copyright (c) 2023,
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/README.md b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/README.md
index a90733ad4d..a824d178e6 100644
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/README.md
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/README.md
@@ -1,184 +1,160 @@
-# [DeltaLM](https://arxiv.org/abs/2106.13736)
+# Deltalm for PyTorch
 
-**Encoder-Decoder Pre-training for Language Generation and Translation** 
-
-[DeltaLM: Encoder-Decoder Pre-training for Language Generation and Translation by Augmenting Pretrained Multilingual Encoders.](https://arxiv.org/abs/2106.13736) Shuming Ma, Li Dong, Shaohan Huang, Dongdong Zhang, Alexandre Muzio, Saksham Singhal, Hany Hassan Awadalla, Xia Song, Furu Wei. CoRR abs/2106.13736.
-
-[mT6: Multilingual Pretrained Text-to-Text Transformer with Translation Pairs.](https://arxiv.org/abs/2104.08692) Zewen Chi, Li Dong, Shuming Ma, Shaohan Huang, Xian-Ling Mao, Heyan Huang, and Furu Wei. In EMNLP 2021.
-
-- September 2021: DeltaLM ranks first on the [WMT21 multilingual translation task](http://www.statmt.org/wmt21/large-scale-multilingual-translation-task.html).
-- August 2021: release code and pretrained checkpoints.
-
----
-
-## Pretrained Models
-
-- [DeltaLM-base](https://deltalm.blob.core.windows.net/deltalm/deltalm-base.pt): #enc-dec=12-6; #hidden=768; #head=12; #FFN=3072 (#parameters: 360M)
-- [DeltaLM-large](https://deltalm.blob.core.windows.net/deltalm/deltalm-large.pt): #enc-dec=24-12; #hidden=1024; #head=16; #FFN=4096 (#parameters: 830M)
-- [Vocabulary](https://deltalm.blob.core.windows.net/deltalm/dict.txt) and [Sentencepiece-model](https://deltalm.blob.core.windows.net/deltalm/spm.model)
-- DeltaLM can be finetuned to support language generation and translation tasks for **100+ languages**
-
-
-## Cross-lingual Abstractive Summarization - [Wikilingua](https://arxiv.org/abs/2010.03093)
-
-We evaluate DeltaLM on cross-lingual abstractive summarization benchmark. We report the results by averaging the numbers in different languages. 
-
-|   Model   |   #Params   |  ROUGE-1  |  ROUGE-2  |  ROUGE-L  |
-|-----------|-------------|-----------|-----------|-----------|
-| [mBART](https://arxiv.org/abs/2001.08210)     | 610M        | 34.5      | 12.9      | **28.7**      |
-| [mT5](https://arxiv.org/abs/2010.11934)       | 300M        | 27.5      | 8.8       | 22.8      |
-| [mT5](https://arxiv.org/abs/2010.11934)       | 580M        | 31.8      | 11.5      | 26.0      |
-| DeltaLM   | 360M        | **35.3**      | **13.4**      | **28.7**      |
-
-
-## Setup
-
-```bash
-git submodule update --init deltalm/fairseq
-cd deltalm/
-pip install --editable fairseq/
-```
-
-## Fine-tuning
-
-1. Organize the raw data in the following structure:
-```
-.
-+-- /path/to/data/
-|   +-- train.src
-|   +-- train.tgt
-|   +-- valid.src
-|   +-- valid.tgt
-```
-
-*Examples (IWSLT14 German to English)*:
-```bash
-bash examples/prepare_iwslt14.sh /tmp/iwslt14
-```
-
-2. Tokenize the data using [Sentencepiece](https://github.com/google/sentencepiece):
-
-```bash
-spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < train.src > train.spm.src
-spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < train.tgt > train.spm.tgt
-spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < valid.src > valid.spm.src
-spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < valid.tgt > valid.spm.tgt
-spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < test.src > test.spm.src
-spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < test.tgt > test.spm.tgt
-```
-
-*Examples (IWSLT14 German to English)*:
-```bash
-bash examples/binary_iwslt14.sh \
-     /tmp/iwslt14/iwslt14.tokenized.de-en \
-     /tmp/iwslt14/iwslt14.spm \
-     /path/to/checkpoint/spm.model
-```
-
-3. Binary the data:
-
-```bash
-data_bin=/path/to/data-bin/
-python preprocess.py  \
-    --trainpref train.spm \
-    --validpref valid.spm \
-    --testpref test.spm \
-    --source-lang src --target-lang tgt \
-    --destdir $data_bin \
-    --srcdict /path/to/checkpoint/dict.txt \
-    --tgtdict /path/to/checkpoint/dict.txt \
-    --workers 40
-```
-
-*Examples (IWSLT14 German to English)*:
-```bash
-bash examples/binary_iwslt14.sh \
-     /tmp/iwslt14/iwslt14.spm \
-     /tmp/iwslt14/iwslt14.bin \
-     /path/to/checkpoint/dict.txt
-```
-
-4. Fine-tuning:
-
-```bash
-PRETRAINED_MODEL=/path/to/checkpoint/model.pt
-python train.py $data_bin \
-    --save-dir $save_dir \
-    --arch deltalm_base \
-    --pretrained-deltalm-checkpoint $PRETRAINED_MODEL \
-    --share-all-embeddings \
-    --max-source-positions 512 --max-target-positions 512 \
-    --criterion label_smoothed_cross_entropy \
-    --label-smoothing 0.1 \
-    --optimizer adam --adam-betas '(0.9, 0.98)' \
-    --lr-scheduler inverse_sqrt \
-    --lr $lr \
-    --warmup-init-lr 1e-07 \
-    --stop-min-lr 1e-09 \
-    --warmup-updates 4000 \
-    --max-update 400000 \
-    --max-epoch 100 \
-    --max-tokens $batch_size \
-    --update-freq 1 \
-    --seed 1 \
-    --log-format simple \
-    --skip-invalid-size-inputs-valid-test
-```
-**Note: 
-- For large checkpoint, please set `--arch deltalm_large`.
-- Please adjust the `max-tokens` and `update-freq` to suit in different experimental environments. Recommendation of the total batch size is `4096 * 128` tokens per step.
-- Use `--fp16` for more efficient training on the devices that have Tensor Cores.
-
-*Examples (IWSLT14 German to English)*:
-```bash
-bash examples/train_iwslt14.sh \
-     /tmp/iwslt14/iwslt14.bin \
-     /tmp/iwslt14/checkpoints \
-     /path/to/checkpoint/model.pt
-```
-
-5. Evaluation:
-
-```bash
-python generate.py $data_bin \
-    --path $save_dir/checkpoint_best.pt \
-    --batch-size 128 --beam 5 --remove-bpe=sentencepiece
-```
-
-*Examples (IWSLT14 German to English)*:
-```bash
-bash examples/evaluate_iwslt14.sh \
-     /tmp/iwslt14/iwslt14.bin \
-     /tmp/iwslt14/checkpoints
-```
-
----
-
-## Citation
-
-If you find this repository useful, please consider citing our work:
-```
-@article{deltalm,
-      title={{DeltaLM}: Encoder-Decoder Pre-training for Language Generation and Translation by Augmenting Pretrained Multilingual Encoders}, 
-      author={Shuming Ma and Li Dong and Shaohan Huang and Dongdong Zhang and Alexandre Muzio and Saksham Singhal and Hany Hassan Awadalla and Xia Song and Furu Wei},
-      year={2021},
-      eprint={2106.13736},
-      archivePrefix={arXiv},
-      primaryClass={cs.CL}
-}
-```
-
-## Acknowledgement
-
-This repository is built using the [Fairseq](https://github.com/pytorch/fairseq) repository.
-
-## License
-This project is licensed under the license found in the LICENSE file in the root directory of this source tree.
-
-[Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
-
-### Contact Information
-
-For help or issues using DeltaLM models, please submit a GitHub issue.
-
-For other communications related to DeltaLM, please contact Shuming Ma (`shumma@microsoft.com`), [Furu Wei](http://gitnlp.org/) (`fuwei@microsoft.com`).
+- [概述](#概述)
+
+- [准备训练环境](#准备训练环境)
+
+- [开始训练](#开始训练)
+
+- [训练结果展示](#训练结果展示)
+
+- [版本说明](#版本说明)
+
+
+# 概述
+
+## 简述
+
+Deltalm 模型是Fairseq套件中基于Transformer结构的翻译模型，在iwslt14 de2en数据集上训练和评估。
+
+- 参考实现：
+
+  ```
+  url=https://github.com/microsoft/unilm/blob/master/deltalm
+  commit_id=eb1cc35e63988b2fe8c1fae348012a57da096e43
+  ```
+
+- 适配昇腾 AI 处理器的实现：
+
+  ```
+  url=https://gitee.com/ascend/ModelZoo-PyTorch.git
+  code_path=PyTorch/built-in/nlp
+  ```
+
+
+# 准备训练环境
+
+## 准备环境
+
+- 当前模型支持的 PyTorch 版本和已知三方库依赖如下表所示。
+
+  **表 1**  版本支持表
+
+  | Torch_Version      | 三方库依赖版本                                 |
+  | :--------: | :----------------------------------------------------------: |
+  | PyTorch 1.8 | - |
+
+- 环境准备指导。
+
+  请参考《[Pytorch框架训练环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes)》。
+
+- 安装套件。
+
+  在模型源码包根目录下执行以下命令。
+  ```bash
+  pip3.7 install -e ./fairseq
+  ```
+  安装相应库
+  ```
+  pip install -r requirements.txt
+  ```
+
+
+## 准备数据集
+
+1. 获取数据集。
+
+    1. 用户可参考源码GPU仓自行下载 `iwslt14` 数据集，并在预处理数据后，上传至服务器任意目录中，如 `/data-bin`。
+    2. 或者使用一键式处理工具`auto-data.sh`，需提前准备：
+       1. tokenize模型："https://deltalm.blob.core.windows.net/deltalm/spm.model"
+       2. 准备数据词典："https://deltalm.blob.core.windows.net/deltalm/dict.txt"
+       3. 准备分词工具：参考 "https://github.com/google/sentencepiece" 的 README 安装 `spm_encode`
+       4. 执行脚本`bash auto-data.sh $1 $2 $3 $4 $5 $6`
+
+          $1：原始数据生成目录 `/tmp/iwslt14`
+
+          $2：最终处理数据目录 `/data-bin`
+
+          $3：tokenize模型路径
+
+          $4：词典路径
+          
+          $5：数据预处理工具下载链接：[mosesdecoder](https://github.com/moses-smt/mosesdecoder.git)
+
+          $6：原始数据下载链接：[iwslt14](http://dl.fbaipublicfiles.com/fairseq/data/iwslt14/de-en.tgz)
+
+2. 获取预训练模型。
+   用户自行下载 `deltalm-base` 预训练模型权重，并放置于上述预处理后的数据目录（如 `/data-bin`）下。
+# 开始训练
+
+## 训练模型
+
+1. 进入源码包根目录。
+
+   ```bash
+   cd /${模型文件夹名称}
+   ```
+
+2. 运行训练脚本。
+
+    该模型支持单机单卡训练和单机8卡训练。
+
+    - 单机单卡训练
+
+      启动单卡训练。
+
+      ```bash
+      bash ./test/train_performance_1p.sh --data_path=/data-bin  # 单卡性能
+      ```
+
+    - 单机8卡训练。
+
+      启动8卡训练。
+
+      ```bash
+      bash ./test/train_full_8p.sh --data_path=/data-bin  # 8卡精度
+      bash ./test/train_performance_8p.sh --data_path=/data-bin  # 8卡性能
+      ```
+
+      --data_path参数填写数据集路径，需写到数据集的一级目录。
+
+
+    模型训练脚本参数说明如下。
+
+    ```
+    公共参数：
+    --data_path                         //数据集路径
+    --arch                              //使用模型架构
+    --save-dir                          //权重文件保存路径
+    --max-epoch                         //最大训练轮数
+    --max-tokens                        //每个batch的最大token数
+    --lr                                //学习率
+    --optimizer                         //使用哪种优化器
+    --eval-bleu                         //验证时使用BLEU指标进行评估
+    --distributed-world-size            //分布式训练使用的设备总数
+    ```
+
+    训练完成后，权重文件默认保存在当前路径的checkpoints目录下，并在test/out目录下输出模型训练精度和性能信息。
+
+# 训练结果展示
+
+**表 2**  iwslt14 de2en数据集训练结果展示表
+
+| NAME  | MODE | Bleu  | WPS  | Epochs | AMP_Type | Torch_Version |
+| :---: |------|:-----:|:----:| :---: | :---: | :---: |
+| 8p-竞品A | fp16 | 39.45 | 14401 | 100 | - | 1.8 |
+| 8p-NPU | fp16 | 39.37 | 16214 | 100 | - | 1.8 |
+
+> **说明：**
+   >由于该模型默认开启二进制，所以在性能测试时，需要安装二进制包，安装方式参考《[Pytorch框架训练环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes)》。
+
+
+# 版本说明
+
+## 变更
+
+2023.6.29：首次发布。
+
+## FAQ
+
+无。
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/README_RAW.md b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/README_RAW.md
new file mode 100644
index 0000000000..a90733ad4d
--- /dev/null
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/README_RAW.md
@@ -0,0 +1,184 @@
+# [DeltaLM](https://arxiv.org/abs/2106.13736)
+
+**Encoder-Decoder Pre-training for Language Generation and Translation** 
+
+[DeltaLM: Encoder-Decoder Pre-training for Language Generation and Translation by Augmenting Pretrained Multilingual Encoders.](https://arxiv.org/abs/2106.13736) Shuming Ma, Li Dong, Shaohan Huang, Dongdong Zhang, Alexandre Muzio, Saksham Singhal, Hany Hassan Awadalla, Xia Song, Furu Wei. CoRR abs/2106.13736.
+
+[mT6: Multilingual Pretrained Text-to-Text Transformer with Translation Pairs.](https://arxiv.org/abs/2104.08692) Zewen Chi, Li Dong, Shuming Ma, Shaohan Huang, Xian-Ling Mao, Heyan Huang, and Furu Wei. In EMNLP 2021.
+
+- September 2021: DeltaLM ranks first on the [WMT21 multilingual translation task](http://www.statmt.org/wmt21/large-scale-multilingual-translation-task.html).
+- August 2021: release code and pretrained checkpoints.
+
+---
+
+## Pretrained Models
+
+- [DeltaLM-base](https://deltalm.blob.core.windows.net/deltalm/deltalm-base.pt): #enc-dec=12-6; #hidden=768; #head=12; #FFN=3072 (#parameters: 360M)
+- [DeltaLM-large](https://deltalm.blob.core.windows.net/deltalm/deltalm-large.pt): #enc-dec=24-12; #hidden=1024; #head=16; #FFN=4096 (#parameters: 830M)
+- [Vocabulary](https://deltalm.blob.core.windows.net/deltalm/dict.txt) and [Sentencepiece-model](https://deltalm.blob.core.windows.net/deltalm/spm.model)
+- DeltaLM can be finetuned to support language generation and translation tasks for **100+ languages**
+
+
+## Cross-lingual Abstractive Summarization - [Wikilingua](https://arxiv.org/abs/2010.03093)
+
+We evaluate DeltaLM on cross-lingual abstractive summarization benchmark. We report the results by averaging the numbers in different languages. 
+
+|   Model   |   #Params   |  ROUGE-1  |  ROUGE-2  |  ROUGE-L  |
+|-----------|-------------|-----------|-----------|-----------|
+| [mBART](https://arxiv.org/abs/2001.08210)     | 610M        | 34.5      | 12.9      | **28.7**      |
+| [mT5](https://arxiv.org/abs/2010.11934)       | 300M        | 27.5      | 8.8       | 22.8      |
+| [mT5](https://arxiv.org/abs/2010.11934)       | 580M        | 31.8      | 11.5      | 26.0      |
+| DeltaLM   | 360M        | **35.3**      | **13.4**      | **28.7**      |
+
+
+## Setup
+
+```bash
+git submodule update --init deltalm/fairseq
+cd deltalm/
+pip install --editable fairseq/
+```
+
+## Fine-tuning
+
+1. Organize the raw data in the following structure:
+```
+.
++-- /path/to/data/
+|   +-- train.src
+|   +-- train.tgt
+|   +-- valid.src
+|   +-- valid.tgt
+```
+
+*Examples (IWSLT14 German to English)*:
+```bash
+bash examples/prepare_iwslt14.sh /tmp/iwslt14
+```
+
+2. Tokenize the data using [Sentencepiece](https://github.com/google/sentencepiece):
+
+```bash
+spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < train.src > train.spm.src
+spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < train.tgt > train.spm.tgt
+spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < valid.src > valid.spm.src
+spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < valid.tgt > valid.spm.tgt
+spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < test.src > test.spm.src
+spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < test.tgt > test.spm.tgt
+```
+
+*Examples (IWSLT14 German to English)*:
+```bash
+bash examples/binary_iwslt14.sh \
+     /tmp/iwslt14/iwslt14.tokenized.de-en \
+     /tmp/iwslt14/iwslt14.spm \
+     /path/to/checkpoint/spm.model
+```
+
+3. Binary the data:
+
+```bash
+data_bin=/path/to/data-bin/
+python preprocess.py  \
+    --trainpref train.spm \
+    --validpref valid.spm \
+    --testpref test.spm \
+    --source-lang src --target-lang tgt \
+    --destdir $data_bin \
+    --srcdict /path/to/checkpoint/dict.txt \
+    --tgtdict /path/to/checkpoint/dict.txt \
+    --workers 40
+```
+
+*Examples (IWSLT14 German to English)*:
+```bash
+bash examples/binary_iwslt14.sh \
+     /tmp/iwslt14/iwslt14.spm \
+     /tmp/iwslt14/iwslt14.bin \
+     /path/to/checkpoint/dict.txt
+```
+
+4. Fine-tuning:
+
+```bash
+PRETRAINED_MODEL=/path/to/checkpoint/model.pt
+python train.py $data_bin \
+    --save-dir $save_dir \
+    --arch deltalm_base \
+    --pretrained-deltalm-checkpoint $PRETRAINED_MODEL \
+    --share-all-embeddings \
+    --max-source-positions 512 --max-target-positions 512 \
+    --criterion label_smoothed_cross_entropy \
+    --label-smoothing 0.1 \
+    --optimizer adam --adam-betas '(0.9, 0.98)' \
+    --lr-scheduler inverse_sqrt \
+    --lr $lr \
+    --warmup-init-lr 1e-07 \
+    --stop-min-lr 1e-09 \
+    --warmup-updates 4000 \
+    --max-update 400000 \
+    --max-epoch 100 \
+    --max-tokens $batch_size \
+    --update-freq 1 \
+    --seed 1 \
+    --log-format simple \
+    --skip-invalid-size-inputs-valid-test
+```
+**Note:**
+- For large checkpoint, please set `--arch deltalm_large`.
+- Please adjust the `max-tokens` and `update-freq` to suit different experimental environments. Recommendation of the total batch size is `4096 * 128` tokens per step.
+- Use `--fp16` for more efficient training on the devices that have Tensor Cores.
+
+*Examples (IWSLT14 German to English)*:
+```bash
+bash examples/train_iwslt14.sh \
+     /tmp/iwslt14/iwslt14.bin \
+     /tmp/iwslt14/checkpoints \
+     /path/to/checkpoint/model.pt
+```
+
+5. Evaluation:
+
+```bash
+python generate.py $data_bin \
+    --path $save_dir/checkpoint_best.pt \
+    --batch-size 128 --beam 5 --remove-bpe=sentencepiece
+```
+
+*Examples (IWSLT14 German to English)*:
+```bash
+bash examples/evaluate_iwslt14.sh \
+     /tmp/iwslt14/iwslt14.bin \
+     /tmp/iwslt14/checkpoints
+```
+
+---
+
+## Citation
+
+If you find this repository useful, please consider citing our work:
+```
+@article{deltalm,
+      title={{DeltaLM}: Encoder-Decoder Pre-training for Language Generation and Translation by Augmenting Pretrained Multilingual Encoders}, 
+      author={Shuming Ma and Li Dong and Shaohan Huang and Dongdong Zhang and Alexandre Muzio and Saksham Singhal and Hany Hassan Awadalla and Xia Song and Furu Wei},
+      year={2021},
+      eprint={2106.13736},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+```
+
+## Acknowledgement
+
+This repository is built using the [Fairseq](https://github.com/pytorch/fairseq) repository.
+
+## License
+This project is licensed under the license found in the LICENSE file in the root directory of this source tree.
+
+[Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
+
+### Contact Information
+
+For help or issues using DeltaLM models, please submit a GitHub issue.
+
+For other communications related to DeltaLM, please contact Shuming Ma (`shumma@microsoft.com`), [Furu Wei](http://gitnlp.org/) (`fuwei@microsoft.com`).
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/clib/cuda/ngram_repeat_block_cuda_kernel.cu b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/clib/cuda/ngram_repeat_block_cuda_kernel.cu
deleted file mode 100644
index bd6106cba0..0000000000
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/clib/cuda/ngram_repeat_block_cuda_kernel.cu
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
-Copyright (c) Microsoft Corporation.
-Licensed under the MIT License.
-*/
-
-/*
-Kernel implementation for blocking repeated n-grams.
-*/
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <math.h>
-#include <torch/extension.h>
-#include <vector>
-
-// Ban repeated ngrams of length = 'no_repeat_ngram_size'
-__global__ void banRepeatedTokens(
-    long* __restrict__ tokens,
-    float* __restrict__ lprobs,
-    int max_predict_len,
-    int vocab_size,
-    int no_repeat_ngram_size) {
-  auto row = blockIdx.x;
-  auto col = threadIdx.x;
-  auto start = row * (max_predict_len) + col;
-  // Each thread compares ngram starting from
-  // thread index with final ngram starting from
-  // step - no_repeat_ngram_size +2
-  auto check_start_pos = blockDim.x;
-  auto lprob_start = row * vocab_size;
-  bool is_banned = true;
-  extern __shared__ long tokens_shm[];
-  tokens_shm[col] = tokens[start];
-  if (col == blockDim.x - 1) {
-    for (int i = 1; i < no_repeat_ngram_size; i++) {
-      if (col + i < max_predict_len) {
-        tokens_shm[col + i] = tokens[start + i];
-      }
-    }
-  }
-  __syncthreads();
-
-  for (int k = 0; k < no_repeat_ngram_size - 1; k++) {
-    if (tokens_shm[col + k] != tokens_shm[check_start_pos + k]) {
-      is_banned = false;
-    }
-  }
-  if (is_banned == true) {
-    auto token_to_be_banned = tokens_shm[col + no_repeat_ngram_size - 1];
-    lprobs[lprob_start + token_to_be_banned] = -INFINITY;
-  }
-}
-
-// Allocate blocks and threads based on
-// batch size and sequence length and launch
-// kernel
-torch::Tensor ngram_repeat_block_cuda_forward(
-    const torch::Tensor tokens,
-    torch::Tensor lprobs,
-    int bsz,
-    int step,
-    int beam_size,
-    int no_repeat_ngram_size) {
-  int threads = step - no_repeat_ngram_size + 2;
-  if (threads <= 0)
-    return lprobs;
-  int max_predict_len = tokens.size(1);
-  int vocab_size = lprobs.size(1);
-  auto token_ptr = tokens.data_ptr<long>();
-  auto lprob_ptr = lprobs.data_ptr<float>();
-  int blocks = bsz * beam_size;
-  int shared_mem_size = (step + 1) * sizeof(long);
-
-  // Launching N blocks where N is number of samples in a batch (beams*bsz)
-  // Launching T threads where T is number of previous ngrams in a sample
-  // Allocating shared mem per block for fastser access of input tokens since
-  // each token will be accessed N times to compare with current Ngram where
-  // N is Ngram size.
-  banRepeatedTokens<<<blocks, threads, shared_mem_size>>>(
-      token_ptr, lprob_ptr, max_predict_len, vocab_size, no_repeat_ngram_size);
-  return lprobs;
-}
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/clib/libnat_cuda/edit_dist.cu b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/clib/libnat_cuda/edit_dist.cu
deleted file mode 100644
index 96569d46c8..0000000000
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/clib/libnat_cuda/edit_dist.cu
+++ /dev/null
@@ -1,344 +0,0 @@
-/**
- * Copyright 2017-present, Facebook, Inc.
- * All rights reserved.
- *
- * This source code is licensed under the license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include "edit_dist.h"
-
-#include <THC/THC.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <device_launch_parameters.h>
-#include <utility> // std::pair
-
-template <typename scalar_t>
-__global__ void generate_deletion_label_kernel(
-    const scalar_t* __restrict__ source,
-    const size_t source_size,
-    const size_t operation_size,
-    int* __restrict__ operations,
-    int* __restrict__ labels) {
-  const int index = blockIdx.x;
-  const int offset = index * operation_size;
-  const int offset_label = index * source_size;
-
-  for (int i = 0; i < source_size; i++) {
-    labels[offset_label + i] = 0;
-  }
-
-  int k = 0;
-  for (int i = 0; i < operation_size; i++) {
-    if (operations[offset + i] == 0) {
-      break;
-    } else if (operations[offset + i] == 1) {
-      continue;
-    } else {
-      labels[offset_label + k] = 3 - operations[offset + i];
-      k++;
-    }
-  }
-}
-
-template <typename scalar_t>
-__global__ void generate_insertion_label_kernel(
-    const scalar_t* __restrict__ target,
-    const size_t target_size,
-    const size_t operation_size,
-    int* __restrict__ operations,
-    int* __restrict__ labels,
-    int* __restrict__ masks) {
-  const int index = blockIdx.x;
-  const int offset = index * operation_size;
-  const int offset_label = index * target_size;
-
-  int k = 0;
-  int u = 0;
-  int m = 0;
-
-  for (int i = 0; i < target_size; i++) {
-    labels[offset_label + i] = 0;
-    masks[offset_label + i] = 0;
-  }
-
-  for (int i = 0; i < operation_size - 1; i++) {
-    if (operations[offset + i] == 0) {
-      break;
-    } else if (operations[offset + i] == 2) {
-      continue;
-    } else if (operations[offset + i] == 1) {
-      masks[offset_label + m] = 1;
-      u++;
-      m++;
-    } else {
-      labels[offset_label + k] = u;
-      masks[offset_label + m] = 0;
-      k++;
-      m++;
-      u = 0;
-    }
-  }
-}
-
-template <typename scalar_t>
-__global__ void levenshtein_distance_kernel(
-    const scalar_t* __restrict__ source,
-    const scalar_t* __restrict__ target,
-    const int* __restrict__ source_length,
-    const int* __restrict__ target_length,
-    const size_t source_size,
-    const size_t target_size,
-    int* __restrict__ operations,
-    int* __restrict__ errors_curr) {
-  const int index = blockIdx.x;
-  const int offset = index * (source_size + target_size);
-  const int d = index * (source_size + 1) * (target_size + 1);
-  const int t = target_size + 1;
-
-  auto err_idx = [d, t](int i, int j) { return d + i * t + j; };
-  auto opt_idx = [offset](int k) { return offset + k; };
-
-  const int hyp_len = source_length[index];
-  const int ref_len = target_length[index];
-  const scalar_t* hyp_begin = source + index * source_size;
-  const scalar_t* ref_begin = target + index * target_size;
-
-  // dynamic programming
-  for (int i = 0; i <= hyp_len; i++) {
-    errors_curr[err_idx(i, 0)] = i;
-  }
-  for (int j = 0; j <= ref_len; j++) {
-    errors_curr[err_idx(0, j)] = j;
-  }
-  for (int i = 1; i <= hyp_len; i++) {
-    for (int j = 1; j <= ref_len; j++) {
-      errors_curr[err_idx(i, j)] = min(
-          min(errors_curr[err_idx(i - 1, j)], errors_curr[err_idx(i, j - 1)]) +
-              1,
-          errors_curr[err_idx(i - 1, j - 1)] +
-              2 * (*(hyp_begin + i - 1) == *(ref_begin + j - 1) ? 0 : 1));
-    }
-  }
-
-  // back-tracing
-  int i = hyp_len;
-  int j = ref_len;
-  int o = hyp_len + ref_len;
-
-  for (int k = 0; k < source_size + target_size; k++) {
-    operations[opt_idx(k)] = 0;
-  }
-
-  while ((i >= 0) && (j >= 0)) {
-    if ((i == 0) && (j == 0)) {
-      break;
-    }
-
-    if ((j > 0) &&
-        (errors_curr[err_idx(i, j - 1)] < errors_curr[err_idx(i, j)])) {
-      o--;
-      operations[opt_idx(o)] = 1;
-      j--; // insertion
-    } else if (
-        (i > 0) &&
-        (errors_curr[err_idx(i - 1, j)] < errors_curr[err_idx(i, j)])) {
-      o--;
-      operations[opt_idx(o)] = 2;
-      i--; // deletion
-    } else {
-      o--;
-      operations[opt_idx(o)] = 3;
-      i--;
-      j--; // do nothing
-    }
-  }
-
-  // moving to the left
-  for (int k = 0; k < hyp_len + ref_len; k++) {
-    if (k + o < hyp_len + ref_len) {
-      operations[opt_idx(k)] = operations[opt_idx(k + o)];
-    } else {
-      operations[opt_idx(k)] = 0; // padding
-    }
-  }
-}
-
-template <typename scalar_t>
-__global__ void faster_levenshtein_distance_kernel(
-    const scalar_t* __restrict__ source,
-    const scalar_t* __restrict__ target,
-    const int* __restrict__ source_length,
-    const int* __restrict__ target_length,
-    const size_t source_size,
-    const size_t target_size,
-    int* __restrict__ operations) {
-  extern __shared__ short errors[];
-  auto errors_curr = errors;
-
-  const int index = blockIdx.x;
-  const int offset = index * (source_size + target_size);
-  const int t = target_size + 1;
-
-  auto err_idx = [t](int i, int j) { return i * t + j; };
-  auto opt_idx = [offset](int k) { return offset + k; };
-
-  const int hyp_len = source_length[index];
-  const int ref_len = target_length[index];
-  const scalar_t* hyp_begin = source + index * source_size;
-  const scalar_t* ref_begin = target + index * target_size;
-
-  // dynamic programming
-  for (int i = 0; i <= hyp_len; i++) {
-    errors_curr[err_idx(i, 0)] = i;
-  }
-  for (int j = 0; j <= ref_len; j++) {
-    errors_curr[err_idx(0, j)] = j;
-  }
-  for (int i = 1; i <= hyp_len; i++) {
-    for (int j = 1; j <= ref_len; j++) {
-      errors_curr[err_idx(i, j)] = min(
-          min(errors_curr[err_idx(i - 1, j)], errors_curr[err_idx(i, j - 1)]) +
-              1,
-          errors_curr[err_idx(i - 1, j - 1)] +
-              2 * (*(hyp_begin + i - 1) == *(ref_begin + j - 1) ? 0 : 1));
-    }
-  }
-
-  // back-tracing
-  int i = hyp_len;
-  int j = ref_len;
-  int o = hyp_len + ref_len;
-
-  for (int k = 0; k < source_size + target_size; k++) {
-    operations[opt_idx(k)] = 0;
-  }
-
-  while ((i >= 0) && (j >= 0)) {
-    if ((i == 0) && (j == 0)) {
-      break;
-    }
-
-    if ((j > 0) &&
-        (errors_curr[err_idx(i, j - 1)] < errors_curr[err_idx(i, j)])) {
-      o--;
-      operations[opt_idx(o)] = 1;
-      j--; // insertion
-    } else if (
-        (i > 0) &&
-        (errors_curr[err_idx(i - 1, j)] < errors_curr[err_idx(i, j)])) {
-      o--;
-      operations[opt_idx(o)] = 2;
-      i--; // deletion
-    } else {
-      o--;
-      operations[opt_idx(o)] = 3;
-      i--;
-      j--; // do nothing
-    }
-  }
-
-  // moving to the left
-  for (int k = 0; k < hyp_len + ref_len; k++) {
-    if (k + o < hyp_len + ref_len) {
-      operations[opt_idx(k)] = operations[opt_idx(k + o)];
-    } else {
-      operations[opt_idx(k)] = 0; // padding
-    }
-  }
-}
-
-torch::Tensor GenerateDeletionLabelCuda(
-    torch::Tensor source,
-    torch::Tensor operations) {
-  const auto batch_size = source.size(0);
-  at::TensorOptions options(source.device());
-  options = options.dtype(at::ScalarType::Int);
-  auto labels = torch::empty({batch_size, source.size(1)}, options);
-  auto stream = at::cuda::getCurrentCUDAStream(source.device().index());
-
-  AT_DISPATCH_ALL_TYPES(source.scalar_type(), "generate_deletion_labels", ([&] {
-                          generate_deletion_label_kernel<scalar_t>
-                              <<<batch_size, 1, 0, stream>>>(
-                                  source.data_ptr<scalar_t>(),
-                                  source.size(1),
-                                  operations.size(1),
-                                  operations.data_ptr<int>(),
-                                  labels.data_ptr<int>());
-                        }));
-
-  return labels;
-}
-
-std::pair<torch::Tensor, torch::Tensor> GenerateInsertionLabelCuda(
-    torch::Tensor target,
-    torch::Tensor operations) {
-  const auto batch_size = target.size(0);
-  at::TensorOptions options(target.device());
-  options = options.dtype(at::ScalarType::Int);
-  auto labels = torch::empty({batch_size, target.size(1)}, options);
-  auto masks = torch::empty({batch_size, target.size(1)}, options);
-  auto stream = at::cuda::getCurrentCUDAStream(target.device().index());
-
-  AT_DISPATCH_ALL_TYPES(
-      target.scalar_type(), "generate_insertion_labels", ([&] {
-        generate_insertion_label_kernel<scalar_t><<<batch_size, 1, 0, stream>>>(
-            target.data_ptr<scalar_t>(),
-            target.size(1),
-            operations.size(1),
-            operations.data_ptr<int>(),
-            labels.data_ptr<int>(),
-            masks.data_ptr<int>());
-      }));
-
-  return std::make_pair(labels, masks);
-}
-
-torch::Tensor LevenshteinDistanceCuda(
-    torch::Tensor source,
-    torch::Tensor target,
-    torch::Tensor source_length,
-    torch::Tensor target_length) {
-  const auto batch_size = source.size(0);
-  const auto shared_size =
-      (source.size(1) + 1) * (target.size(1) + 1) * sizeof(short);
-
-  at::TensorOptions options(source.device());
-  options = options.dtype(at::ScalarType::Int);
-  auto operations =
-      torch::empty({batch_size, source.size(1) + target.size(1)}, options);
-  auto stream = at::cuda::getCurrentCUDAStream(source.device().index());
-
-  if (shared_size > 40000) {
-    auto distances = torch::empty(
-        {batch_size, (source.size(1) + 1) * (target.size(1) + 1)}, options);
-    AT_DISPATCH_ALL_TYPES(source.scalar_type(), "levenshtein_distance", ([&] {
-                            levenshtein_distance_kernel<scalar_t>
-                                <<<batch_size, 1, 0, stream>>>(
-                                    source.data_ptr<scalar_t>(),
-                                    target.data_ptr<scalar_t>(),
-                                    source_length.data_ptr<int>(),
-                                    target_length.data_ptr<int>(),
-                                    source.size(1),
-                                    target.size(1),
-                                    operations.data_ptr<int>(),
-                                    distances.data_ptr<int>());
-                          }));
-  } else {
-    AT_DISPATCH_ALL_TYPES(
-        source.scalar_type(), "faster_levenshtein_distance", ([&] {
-          faster_levenshtein_distance_kernel<scalar_t>
-              <<<batch_size, 1, shared_size, stream>>>(
-                  source.data_ptr<scalar_t>(),
-                  target.data_ptr<scalar_t>(),
-                  source_length.data_ptr<int>(),
-                  target_length.data_ptr<int>(),
-                  source.size(1),
-                  target.size(1),
-                  operations.data_ptr<int>());
-        }));
-  }
-
-  return operations;
-}
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/cuda_utils.cu b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/cuda_utils.cu
deleted file mode 100644
index 924f852758..0000000000
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/cuda_utils.cu
+++ /dev/null
@@ -1,202 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-template <typename U, typename V>
-constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) {
-  return (a + b - 1) / b;
-}
-
-template <int FS, int SB, int padding_l, typename scalar_t>
-__inline__ __device__ void zeroSharedMem(scalar_t* data) {
-  /*
-    Given an array of length FS + SB, zero out the first padding_l and last
-    (FS - padding_l) values in the array
-  */
-
-  int tid = threadIdx.x;
-
-  if (FS < SB) {
-    // zero all if we have enough threads in a block to do all of them
-    if (tid < padding_l || tid > SB - FS + padding_l - 1) {
-      data[tid] = scalar_t(0.0);
-    }
-  } else {
-    // otherwise zero out one block at a time
-    const int numIterations = divUp<int, int>(FS, SB);
-    for (int i = 0; i < numIterations; i++) {
-      int offset = i * SB;
-      if (tid + offset < padding_l) {
-        data[tid + offset] = scalar_t(0.0);
-      } else if (tid + offset < FS) {
-        data[SB + tid + offset] = scalar_t(0.0);
-      }
-    }
-  }
-}
-
-template <typename scalar_t>
-__inline__ __device__ scalar_t warpReduce(scalar_t data) {
-  /*
-    Reduce an array within each warp. After processing all values in warp will
-    caontain the sum of all original values in that warp.
-
-    data - pointer to data to reduce
-  */
-  data += __shfl_xor_sync(SHFL_MASK, data, 16);
-  data += __shfl_xor_sync(SHFL_MASK, data, 8);
-  data += __shfl_xor_sync(SHFL_MASK, data, 4);
-  data += __shfl_xor_sync(SHFL_MASK, data, 2);
-  data += __shfl_xor_sync(SHFL_MASK, data, 1);
-  return data;
-}
-
-template <typename scalar_t>
-__inline__ __device__ scalar_t blockReduce(scalar_t data) {
-  /*
-     Reduce an entire array on the block level. After processing, the
-     first value in the array will contain the reduced sum.
-
-     data - pointer to data to reduce
-  */
-
-  static __shared__ scalar_t warpSum[32];
-  const int tid = threadIdx.x;
-  int wid = tid / 32;
-  int lane = tid % 32;
-
-  __syncthreads();
-
-  // reduce each warp then write to shared memory
-  scalar_t sum = warpReduce(data);
-  if (lane == 0) {
-    warpSum[wid] = sum;
-  }
-
-  __syncthreads();
-
-  scalar_t v;
-  // perform final sum of partial warp sums
-  if (tid < blockDim.x / 32) {
-    v = warpSum[lane];
-  } else {
-    v = scalar_t(0.0);
-  }
-
-  if (wid == 0) {
-    v = warpReduce(v);
-  }
-  __syncthreads();
-
-  return v;
-}
-
-void checkCudaStatus(cudaError_t status, int lineNumber = -1) {
-  if (status != cudaSuccess) {
-    std::cout << cudaGetErrorString(status) << " at line " << lineNumber
-              << std::endl;
-    std::cout << "Exiting" << std::endl;
-    exit(1);
-  }
-}
-
-template <int FS, int SB, int padding_l, typename scalar_t>
-__device__ void load_input_to_shared(
-    const scalar_t* input, // global memory
-    int inputOffset,
-    int sequenceLength,
-    int iteration,
-    int numIterations,
-    bool no_prev,
-    scalar_t* output /* shared memory */) {
-  /*
-    Load a block size of input into shared memory with
-    right and left overhang of total size FS. If previously
-    loaded memory, overlap will be shifted over to reduce
-    global memory access
-
-    input - pointer to start of channel sequence
-    inputOffset - how far in the sequence to start loading
-    sequenceLength - total length of sequence
-    iteration - which block of sequence we are loading
-    numIterations - total number of blocks to load
-    no_prev - whether to load the whole block if the previous block
-              wasn't loaded
-    output - shared memory to write input to
-  */
-
-  const int tid = threadIdx.x;
-
-  // Load the left "overhang" of input
-  if (iteration > 0) {
-    if (padding_l < SB) {
-      // load all at once
-      if (tid < padding_l) {
-        output[tid] =
-            (no_prev) ? input[inputOffset - padding_l + tid] : output[tid + SB];
-      }
-    } else {
-      // load in chunks of size SB
-      int numIterations = divUp<int, int>(padding_l, SB);
-      for (int i = 0; i < numIterations; i++) {
-        int offset = i * SB;
-        if ((tid + offset) < padding_l) {
-          output[tid + offset] = (no_prev)
-              ? input[inputOffset - padding_l + tid + offset]
-              : output[tid + offset + SB];
-        }
-      }
-    }
-  }
-
-  // Load the right "overhang" of input
-  if (iteration < (numIterations - 1)) {
-    const int elementsLeft = sequenceLength - (iteration + 1) * SB;
-
-    if ((FS - padding_l) < SB) {
-      // load all at once
-      if (tid < (FS - padding_l)) {
-        output[padding_l + SB + tid] = (tid < elementsLeft)
-            ? input[inputOffset + SB + tid]
-            : scalar_t(0.0);
-      }
-    } else {
-      // load in chunks of size SB
-      int numIterations = divUp<int, int>(FS - padding_l, SB);
-      for (int i = 0; i < numIterations; i++) {
-        int offset = i * SB;
-        if ((tid + offset) < (FS - padding_l)) {
-          output[padding_l + SB + tid + offset] =
-              ((tid + offset) < elementsLeft)
-              ? input[inputOffset + SB + tid + offset]
-              : scalar_t(0.0);
-        }
-      }
-    }
-  }
-
-  // We should also clear out the right "overhang"
-  if (iteration == (numIterations - 1)) {
-    if ((FS - padding_l) < SB) {
-      // clear out all at once
-      if (tid < (FS - padding_l)) {
-        output[padding_l + SB + tid] = scalar_t(0.0);
-      }
-    } else {
-      // clear in chunks of size SB
-      int numIterations = divUp<int, int>(FS - padding_l, SB);
-      for (int i = 0; i < numIterations; i++) {
-        int offset = i * SB;
-        if ((tid + offset) < (FS - padding_l)) {
-          output[padding_l + SB + tid + offset] = scalar_t(0.0);
-        }
-      }
-    }
-  }
-  output[tid + padding_l] = ((inputOffset + tid) < sequenceLength)
-      ? input[inputOffset + tid]
-      : scalar_t(0.0);
-}
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu
deleted file mode 100644
index 4630f1e982..0000000000
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu
+++ /dev/null
@@ -1,176 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include "../cuda_utils.cu"
-#include "dynamicconv_cuda.cuh"
-#include "dynamicconv_cuda_backward.cu"
-#include "dynamicconv_cuda_forward.cu"
-
-// FS is filter size and kernels are specialized for filter sizes
-template <int FS, int SB, int padding_l, typename scalar_t>
-__global__ void dynamicconv_forward_kernel(
-    const scalar_t* input,
-    const scalar_t* weight,
-    int minibatch,
-    int sequenceLength,
-    int numFeatures,
-    int numFiltersInBlock,
-    int numHeads,
-    scalar_t* output) {
-  assert(blockDim.x == SB);
-
-  const int tid = threadIdx.x;
-  const int batchIdx = blockIdx.x;
-  const int featureIdx = blockIdx.y;
-  const int head = featureIdx / numFiltersInBlock;
-
-  const int IOOffset =
-      batchIdx * numFeatures * sequenceLength + featureIdx * sequenceLength;
-  const scalar_t* inputFeature = &input[IOOffset];
-  scalar_t* outputFeature = &output[IOOffset];
-
-  scalar_t filter[FS];
-
-  __shared__ scalar_t tempInput[SB + FS];
-  zeroSharedMem<FS, SB, padding_l>(tempInput);
-
-  const int numIterations = divUp<int, int>(sequenceLength, SB);
-
-  for (int i = 0; i < numIterations; ++i) {
-    __syncthreads();
-    const int inputOffset = i * SB;
-    load_input_to_shared<FS, SB, padding_l>(
-        inputFeature,
-        inputOffset,
-        sequenceLength,
-        i,
-        numIterations,
-        false,
-        tempInput);
-    __syncthreads();
-    if (inputOffset + tid < sequenceLength) {
-#pragma unroll
-      for (int k = 0; k < FS; ++k) {
-        const int filterOffset = batchIdx * numHeads * FS * sequenceLength +
-            head * FS * sequenceLength + k * sequenceLength + i * SB + tid;
-        filter[k] = weight[filterOffset];
-      }
-
-      scalar_t out = scalar_t(0.0);
-#pragma unroll
-      for (int k = 0; k < FS; ++k) {
-        out += filter[k] * tempInput[tid + k];
-      }
-
-      outputFeature[inputOffset + tid] = out;
-    }
-  }
-}
-
-template <int FS, int SB, int padding_l, typename scalar_t>
-__global__ void dynamicconv_backward_kernel(
-    const scalar_t* gradOutput, // B * C * T
-    const scalar_t* input, // B * C * T
-    const scalar_t* weight,
-    int minibatch,
-    int sequenceLength,
-    int numFeatures,
-    int numFiltersInBlock,
-    int numHeads,
-    scalar_t* gradWeight,
-    scalar_t* gradInput) { // B * H * k * T
-
-  assert(blockDim.x == SB);
-
-  // each block operates on a single batch and filter head
-  const int tid = threadIdx.x;
-  const int batchIdx = blockIdx.x;
-  const int headIdx = blockIdx.y;
-  const int chunkIdx = blockIdx.z;
-
-  const int numChunks = divUp<int, int>(sequenceLength, SB);
-  const int inputOffset = chunkIdx * SB;
-
-  // initialize shared memory for output gradient and input
-  __shared__ scalar_t tempGradOutput[SB + FS];
-  __shared__ scalar_t tempInput[SB + FS];
-  const int padding = FS - padding_l - 1;
-
-  zeroSharedMem<FS, SB, padding>(tempGradOutput);
-  zeroSharedMem<FS, SB, padding_l>(tempInput);
-
-  // initialize local filter and weight gradient sum arrays
-  scalar_t tempGradSum[FS];
-  scalar_t bfilter[FS];
-  for (int k = 0; k < FS; ++k) {
-    tempGradSum[k] = scalar_t(0.0);
-
-    int idxOffset = inputOffset + tid + k - padding;
-    if (idxOffset >= 0 && idxOffset < sequenceLength) {
-      int bfilterOffset = batchIdx * numHeads * FS * sequenceLength +
-          headIdx * FS * sequenceLength + (FS - k - 1) * sequenceLength +
-          idxOffset;
-      bfilter[k] = weight[bfilterOffset];
-    } else {
-      bfilter[k] = scalar_t(0.0);
-    }
-  }
-
-  // iterate over filter block
-  for (int featureIdx = 0; featureIdx < numFiltersInBlock; ++featureIdx) {
-    __syncthreads();
-
-    // load input and output gradient for this channel and chunk
-    const int IOOffset = batchIdx * numFeatures * sequenceLength +
-        (headIdx * numFiltersInBlock + featureIdx) * sequenceLength;
-    const scalar_t* inputFeature = &input[IOOffset];
-    const scalar_t* gradOutputFeature = &gradOutput[IOOffset];
-    scalar_t* gradInputFeature = &gradInput[IOOffset];
-
-    load_input_to_shared<FS, SB, padding>(
-        gradOutputFeature,
-        inputOffset,
-        sequenceLength,
-        chunkIdx,
-        numChunks,
-        true,
-        tempGradOutput);
-    load_input_to_shared<FS, SB, padding_l>(
-        inputFeature,
-        inputOffset,
-        sequenceLength,
-        chunkIdx,
-        numChunks,
-        true,
-        tempInput);
-    __syncthreads();
-
-    // sum input and weight gradients
-    scalar_t out = scalar_t(0.0);
-#pragma unroll
-    for (int k = 0; k < FS; ++k) {
-      tempGradSum[k] += tempInput[tid + k] * tempGradOutput[tid + padding];
-      out += bfilter[k] * tempGradOutput[tid + k];
-    }
-
-    if (inputOffset + tid < sequenceLength) {
-      gradInputFeature[inputOffset + tid] = out;
-    }
-  }
-
-  const int gradOffset =
-      batchIdx * numHeads * FS * sequenceLength + headIdx * FS * sequenceLength;
-  scalar_t* gradWeightFeature = &gradWeight[gradOffset];
-
-  // write weight gradient
-  if (inputOffset + tid < sequenceLength) {
-    for (int k = 0; k < FS; ++k) {
-      const int outputOffset = k * sequenceLength + inputOffset + tid;
-      gradWeightFeature[outputOffset] = tempGradSum[k];
-    }
-  }
-}
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu
deleted file mode 100644
index cdf31d5d2d..0000000000
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu
+++ /dev/null
@@ -1,400 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include "../cuda_utils.cu"
-#include "lightconv_cuda.cuh"
-#include "lightconv_cuda_backward.cu"
-#include "lightconv_cuda_forward.cu"
-
-template <int FS, int SB, int padding_l, typename scalar_t>
-__global__ void lightconv_forward_kernel(
-    const scalar_t* input,
-    const scalar_t* filters,
-    int minibatch,
-    int sequenceLength,
-    int numFeatures,
-    int numFiltersInBlock,
-    scalar_t* output) {
-  const int tid = threadIdx.x;
-  const int batchIdx = blockIdx.x;
-  const int featureIdx = blockIdx.y;
-  const int filterIdx = featureIdx / numFiltersInBlock;
-
-  const int IOOffset =
-      numFeatures * sequenceLength * batchIdx + featureIdx * sequenceLength;
-  const scalar_t* inputFeature = &input[IOOffset];
-  scalar_t* outputFeature = &output[IOOffset];
-  const scalar_t* inputFilter = &filters[filterIdx * FS];
-
-  assert(blockDim.x == SB);
-
-  scalar_t filter[FS];
-#pragma unroll
-  for (int i = 0; i < FS; ++i) {
-    filter[i] = inputFilter[i];
-  }
-
-  __shared__ scalar_t temp[SB + FS];
-  zeroSharedMem<FS, SB, padding_l>(temp);
-
-  const int numIterations = divUp<int, int>(sequenceLength, SB);
-
-  for (int i = 0; i < numIterations; ++i) {
-    // Read input into shared memory
-    const int inputOffset = i * SB;
-
-    load_input_to_shared<FS, SB, padding_l>(
-        inputFeature,
-        inputOffset,
-        sequenceLength,
-        i,
-        numIterations,
-        (numIterations == 1),
-        temp);
-
-    __syncthreads();
-
-    scalar_t out = 0;
-#pragma unroll
-    for (int j = 0; j < FS; ++j) {
-      out += filter[j] * temp[tid + j];
-    }
-
-    // Write output
-    const int outputOffset = inputOffset;
-    if ((outputOffset + tid) < sequenceLength) {
-      outputFeature[outputOffset + tid] = out;
-    }
-
-    __syncthreads();
-  }
-}
-
-template <int FS, int SB, int padding_l, typename scalar_t>
-__global__ void lightconv_grad_wrt_input_kernel(
-    const scalar_t* input,
-    const scalar_t* filters,
-    int minibatch,
-    int sequenceLength,
-    int numFeatures,
-    int numFiltersInBlock,
-    scalar_t* output) {
-  // input grad kernel is similar to forward kernel
-  const int tid = threadIdx.x;
-  const int batchIdx = blockIdx.x;
-  const int featureIdx = blockIdx.y;
-  const int filterIdx = featureIdx / numFiltersInBlock;
-
-  const int IOOffset =
-      numFeatures * sequenceLength * batchIdx + featureIdx * sequenceLength;
-  const scalar_t* inputFeature = &input[IOOffset];
-  scalar_t* outputFeature = &output[IOOffset];
-  const scalar_t* inputFilter = &filters[filterIdx * FS];
-
-  assert(blockDim.x == SB);
-
-  scalar_t filter[FS];
-
-// The only change is loading the filter in reverse
-#pragma unroll
-  for (int i = 0; i < FS; ++i) {
-    filter[i] = inputFilter[FS - i - 1];
-  }
-
-  __shared__ scalar_t temp[SB + FS];
-  const int padding = FS - padding_l - 1;
-  zeroSharedMem<FS, SB, padding>(temp);
-
-  __syncthreads();
-
-  const int numIterations = divUp<int, int>(sequenceLength, SB);
-
-  for (int i = 0; i < numIterations; ++i) {
-    // Read input into shared memory
-    const int inputOffset = i * SB;
-
-    load_input_to_shared<FS, SB, padding>(
-        inputFeature,
-        inputOffset,
-        sequenceLength,
-        i,
-        numIterations,
-        false,
-        temp);
-
-    __syncthreads();
-
-    scalar_t out = 0;
-#pragma unroll
-    for (int j = 0; j < FS; ++j) {
-      out += filter[j] * temp[tid + j];
-    }
-
-    // Write output
-    const int outputOffset = inputOffset;
-    if ((outputOffset + tid) < sequenceLength) {
-      outputFeature[outputOffset + tid] = out;
-    }
-
-    __syncthreads();
-  }
-}
-
-// This is by far the most expensive kernel in terms of time taken.
-// Can be 16x slower than the forward or grad_wrt_input when filter size is 31
-template <int FS, int SB, int padding_l, typename scalar_t>
-__global__ void lightconv_grad_wrt_weights_firstpass_short_kernel(
-    const scalar_t* input,
-    const scalar_t* gradInput,
-    int minibatch,
-    int sequenceLength,
-    int numFeatures,
-    int numFiltersInBlock,
-    int numHeads,
-    float* output) {
-  const int tid = threadIdx.x;
-  const int batchIdx = blockIdx.x;
-  const int filterIdx = blockIdx.y;
-
-  const int numIterations = divUp<int, int>(sequenceLength, SB);
-
-  float* tempOutputGradWeight = &output[filterIdx * FS * minibatch];
-
-  assert(blockDim.x == SB);
-
-  __shared__ scalar_t tempInput[SB + FS];
-  __shared__ scalar_t tempGradInput[SB + FS];
-
-  // local weight accumulation
-  float accumWeights[FS];
-
-  // Initialize memory
-  for (int i = 0; i < FS; ++i) {
-    accumWeights[i] = float(0.0);
-  }
-
-  // loop over each sequence within filterblock
-  for (int idxInFilterBlock = 0; idxInFilterBlock < numFiltersInBlock;
-       ++idxInFilterBlock) {
-    const int featureOffset = batchIdx * numFeatures * sequenceLength +
-        (filterIdx * numFiltersInBlock + idxInFilterBlock) * sequenceLength;
-    const scalar_t* inputFeature = &input[featureOffset];
-    const scalar_t* gradInputFeature = &gradInput[featureOffset];
-
-    zeroSharedMem<FS, SB, padding_l>(tempInput);
-    zeroSharedMem<FS, SB, (FS / 2)>(tempGradInput);
-    __syncthreads();
-
-    for (int i = 0; i < numIterations; ++i) {
-      const int inputOffset = i * SB;
-
-      load_input_to_shared<FS, SB, padding_l>(
-          inputFeature,
-          inputOffset,
-          sequenceLength,
-          i,
-          numIterations,
-          false,
-          tempInput);
-      load_input_to_shared<FS, SB, (FS / 2)>(
-          gradInputFeature,
-          inputOffset,
-          sequenceLength,
-          i,
-          numIterations,
-          false,
-          tempGradInput);
-
-      __syncthreads();
-
-      const int gradIndex = (FS / 2) + tid;
-      scalar_t tempGrad = tempGradInput[gradIndex];
-
-#pragma unroll
-      for (int j = 0; j < FS; j++) {
-        const int inputIndex = tid + j;
-        accumWeights[j] += tempInput[inputIndex] * tempGrad;
-      }
-
-      __syncthreads();
-    }
-  }
-
-  // Row-major sum
-  for (int filterWeightIdx = 0; filterWeightIdx < FS; ++filterWeightIdx) {
-    float temp;
-    if (tid < sequenceLength) {
-      temp = accumWeights[filterWeightIdx];
-    } else {
-      temp = float(0.0);
-    }
-
-    const int outputOffset = filterWeightIdx * minibatch + batchIdx;
-
-    temp = blockReduce(temp);
-
-    if (tid == 0) {
-      tempOutputGradWeight[outputOffset] = temp;
-    }
-  }
-}
-
-template <int FS, int SB, typename scalar_t>
-__global__ void lightconv_grad_wrt_weights_secondpass_short_kernel(
-    const float* input,
-    const int minibatch,
-    const int numFiltersInBlock,
-    scalar_t* output) {
-  assert(blockDim.x == SB);
-
-  const int tid = threadIdx.x;
-
-  const int filterIdx = blockIdx.x;
-  const int filterWeightIdx = blockIdx.y;
-
-  const int inputOffset =
-      filterIdx * FS * minibatch + filterWeightIdx * minibatch;
-  const float* tempInput = &input[inputOffset];
-
-  // read into shared memory for reduction
-  int readIndex = tid;
-
-  float sum = 0.0;
-  while (readIndex < minibatch) {
-    sum += tempInput[readIndex];
-    readIndex += SB;
-  }
-
-  float temp = blockReduce(sum);
-
-  if (tid == 0) {
-    output[blockIdx.x * FS + blockIdx.y] = temp;
-  }
-}
-
-// This is by far the most expensive kernel in terms of time taken.
-// Can be 16x slower than the forward or grad_wrt_input when filter size is 31
-template <int FS, int SB, int padding_l, typename scalar_t>
-__global__ void lightconv_grad_wrt_weights_firstpass_kernel(
-    const scalar_t* input,
-    const scalar_t* gradInput,
-    int minibatch,
-    int sequenceLength,
-    int numFeatures,
-    int numFiltersInBlock,
-    float* output) {
-  assert(blockDim.x == SB);
-
-  const int tid = threadIdx.x;
-  const int batchIdx = blockIdx.x;
-  const int featureIdx = blockIdx.y;
-  const int filterIdx = featureIdx / numFiltersInBlock;
-  const int idxInFilterBlock = featureIdx % numFiltersInBlock;
-
-  const int numIterations = divUp<int, int>(sequenceLength, SB);
-
-  float temp;
-
-  __shared__ scalar_t tempInput[SB + FS];
-  __shared__ scalar_t tempGradInput[SB + FS];
-  zeroSharedMem<FS, SB, padding_l>(tempInput);
-  zeroSharedMem<FS, SB, (FS / 2)>(tempGradInput);
-  __syncthreads();
-
-  float accumWeights[FS];
-
-  for (int i = 0; i < FS; ++i) {
-    accumWeights[i] = float(0.0);
-  }
-
-  const int IOOffset =
-      batchIdx * numFeatures * sequenceLength + featureIdx * sequenceLength;
-  const scalar_t* inputFeature = &input[IOOffset];
-  const scalar_t* gradInputFeature = &gradInput[IOOffset];
-  float* tempOutputGradWeight =
-      &output[filterIdx * FS * minibatch * numFiltersInBlock];
-
-  for (int i = 0; i < numIterations; ++i) {
-    const int inputOffset = i * SB;
-
-    load_input_to_shared<FS, SB, padding_l>(
-        inputFeature,
-        inputOffset,
-        sequenceLength,
-        i,
-        numIterations,
-        false,
-        tempInput);
-    load_input_to_shared<FS, SB, (FS / 2)>(
-        gradInputFeature,
-        inputOffset,
-        sequenceLength,
-        i,
-        numIterations,
-        false,
-        tempGradInput);
-    __syncthreads();
-
-#pragma unroll
-    for (int j = 0; j < FS; ++j) {
-      accumWeights[j] += tempInput[tid + j] * tempGradInput[tid + (FS / 2)];
-    }
-
-    __syncthreads();
-  }
-
-  // Row-major sum
-  for (int filterWeightIdx = 0; filterWeightIdx < FS; ++filterWeightIdx) {
-    // Write to shared memory before reduction
-    if (tid < sequenceLength) {
-      temp = accumWeights[filterWeightIdx];
-    } else {
-      temp = float(0.0);
-    }
-
-    temp = blockReduce(temp);
-
-    const int outputOffset = filterWeightIdx * minibatch * numFiltersInBlock +
-        batchIdx * numFiltersInBlock + idxInFilterBlock;
-
-    if (tid == 0) {
-      tempOutputGradWeight[outputOffset] = temp;
-    }
-  }
-}
-
-template <int FS, int SB, typename scalar_t>
-__global__ void lightconv_grad_wrt_weights_secondpass_kernel(
-    const float* input,
-    const int minibatch,
-    const int numFiltersInBlock,
-    scalar_t* output) {
-  assert(blockDim.x == SB);
-  const int tid = threadIdx.x;
-
-  // What is the id within a minibatch
-  const int filterIdx = blockIdx.x;
-  const int filterWeightIdx = blockIdx.y;
-
-  const int inputOffset = filterIdx * FS * minibatch * numFiltersInBlock +
-      filterWeightIdx * minibatch * numFiltersInBlock;
-  const float* tempInput = &input[inputOffset];
-
-  int readIndex = tid;
-
-  float sum = float(0.0);
-  while (readIndex < (minibatch * numFiltersInBlock)) {
-    sum += tempInput[readIndex];
-    readIndex += SB;
-  }
-
-  float temp = blockReduce(sum);
-
-  if (tid == 0) {
-    output[blockIdx.x * FS + blockIdx.y] = temp;
-  }
-}
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/optim/dynamic_loss_scaler.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/optim/dynamic_loss_scaler.py
index 43f9be37b9..743e30a675 100644
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/optim/dynamic_loss_scaler.py
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/optim/dynamic_loss_scaler.py
@@ -2,7 +2,8 @@
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
-
+import torch
+import torch_npu
 
 class DynamicLossScaler(object):
     def __init__(
@@ -24,6 +25,7 @@ class DynamicLossScaler(object):
         self._last_rescale_iter = -1
         self._overflows_since_rescale = 0
         self.min_loss_scale = min_loss_scale
+        self.found_inf = torch.npu.FloatTensor([0.0])
 
     def scale(self, outputs):
         return self.loss_scale * outputs
@@ -41,7 +43,15 @@ class DynamicLossScaler(object):
 
     def check_overflow(self, grad_norm):
         # detect inf and nan
-        if grad_norm == float("inf") or grad_norm != grad_norm:
+        self.found_inf.fill_(0.0)
+        has_overflow = torch.npu.get_npu_overflow_flag()
+        if has_overflow:
+            self.found_inf.fill_(1)
+        if torch.distributed.is_initialized():
+            torch.distributed.all_reduce(self.found_inf,
+                                         op=torch.distributed.ReduceOp.MAX)
+        found_inf_flag = self.found_inf.item() > 0
+        if found_inf_flag:
             # overflow has occured
             prev_scale = self.loss_scale
             iter_since_rescale = self._iter - self._last_rescale_iter
@@ -68,3 +78,4 @@ class DynamicLossScaler(object):
 
             self._iter += 1
             raise OverflowError("setting loss scale to: " + str(self.loss_scale))
+
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/sequence_generator.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/sequence_generator.py
index 2e61140dd8..83c7b0e216 100644
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/sequence_generator.py
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/sequence_generator.py
@@ -500,7 +500,7 @@ class SequenceGenerator(nn.Module):
             # from the list of {2 * beam_size} candidates were
             # selected. Shapes: (batch size, beam size)
             new_cands_to_ignore, active_hypos = torch.topk(
-                active_mask, k=beam_size, dim=1, largest=False
+                active_mask.float(), k=beam_size, dim=1, largest=False
             )
 
             # update cands_to_ignore to ignore any finalized hypos.
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/tasks/fairseq_task.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/tasks/fairseq_task.py
index 8148c77fe1..a097e75148 100644
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/tasks/fairseq_task.py
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/tasks/fairseq_task.py
@@ -490,6 +490,7 @@ class FairseqTask(object):
         with torch.autograd.profiler.record_function("forward"):
             with torch.cuda.amp.autocast(enabled=(isinstance(optimizer, AMPOptimizer))):
                 loss, sample_size, logging_output = criterion(model, sample)
+        torch.npu.clear_npu_overflow_flag()
         if ignore_grad:
             loss *= 0
         with torch.autograd.profiler.record_function("backward"):
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/trainer.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/trainer.py
index 5f20895c1f..5424171996 100644
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/trainer.py
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/trainer.py
@@ -56,7 +56,7 @@ class Trainer(object):
         self.tpu = cfg.common.tpu
         self.cuda = torch.cuda.is_available() and not cfg.common.cpu and not self.tpu
         if self.cuda:
-            self.device = torch.device("cuda")
+            self.device = torch.device("npu")
         elif self.tpu:
             self.device = utils.get_tpu_device()
         else:
@@ -302,11 +302,6 @@ class Trainer(object):
                 self.cfg, params, allow_unsupported=allow_unsupported
             )
         elif self.cfg.common.fp16 or self.cfg.common.bf16 or self.cfg.common.amp:
-            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
-                logger.info(
-                    "NOTE: your device does NOT support faster training with --fp16 or --amp, "
-                    "please switch to FP32 which is likely to be faster"
-                )
             if (
                 self.cfg.common.memory_efficient_fp16
                 or self.cfg.common.memory_efficient_bf16
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/utils.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/utils.py
index d1ec9a274c..96cfce1085 100644
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/utils.py
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/utils.py
@@ -737,8 +737,6 @@ class CudaEnvironment(object):
         cur_device = torch.cuda.current_device()
         prop = torch.cuda.get_device_properties("cuda:{}".format(cur_device))
         self.name = prop.name
-        self.major = prop.major
-        self.minor = prop.minor
         self.total_memory_in_GB = prop.total_memory / 1024 / 1024 / 1024
 
     @staticmethod
@@ -754,7 +752,6 @@ class CudaEnvironment(object):
         for r, env in enumerate(cuda_env_list):
             logger.info(
                 "rank {:3d}: ".format(r)
-                + "capabilities = {:2d}.{:<2d} ; ".format(env.major, env.minor)
                 + "total memory = {:.3f} GB ; ".format(env.total_memory_in_GB)
                 + "name = {:40s}".format(env.name)
             )
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq_cli/train.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq_cli/train.py
index 30df9b34dd..8b93429c23 100644
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq_cli/train.py
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq_cli/train.py
@@ -47,6 +47,7 @@ from omegaconf import DictConfig, OmegaConf
 
 
 def main(cfg: FairseqConfig) -> None:
+    torch.npu.set_compile_mode(jit_compile=False)
     if isinstance(cfg, argparse.Namespace):
         cfg = convert_namespace_to_omegaconf(cfg)
 
@@ -167,8 +168,8 @@ def main(cfg: FairseqConfig) -> None:
 
     train_meter = meters.StopwatchMeter()
     train_meter.start()
-    logger.info(f"epoch_itr.next_epoch_itr: {epoch_itr.next_epoch_itr}")
     logger.info(f"max_epoch: {max_epoch}")
+    torch.npu.clear_npu_overflow_flag()
     while epoch_itr.next_epoch_idx <= max_epoch:
         if lr <= cfg.optimization.stop_min_lr:
             logger.info(
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/requirements.txt b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/requirements.txt
new file mode 100644
index 0000000000..71fa21ef3e
--- /dev/null
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/requirements.txt
@@ -0,0 +1,3 @@
+sacrebleu
+sacremoses
+scipy
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/env_npu.sh b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/env_npu.sh
new file mode 100644
index 0000000000..4daa5da815
--- /dev/null
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/env_npu.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+CANN_INSTALL_PATH_CONF='/etc/Ascend/ascend_cann_install.info'
+# Resolve the CANN install root from the install-info file; use the default when the file or its entry is missing.
+CANN_INSTALL_PATH=""
+if [ -f "$CANN_INSTALL_PATH_CONF" ]; then
+    CANN_INSTALL_PATH=$(grep Install_Path "$CANN_INSTALL_PATH_CONF" | cut -d "=" -f 2)
+fi
+CANN_INSTALL_PATH=${CANN_INSTALL_PATH:-/usr/local/Ascend}
+
+if [ -d "${CANN_INSTALL_PATH}/ascend-toolkit/latest" ]; then
+    source "${CANN_INSTALL_PATH}/ascend-toolkit/set_env.sh"
+else
+    source "${CANN_INSTALL_PATH}/nnae/set_env.sh"
+fi
+# Ascend logging and runtime switches (values kept exactly as in the original script).
+export ASCEND_SLOG_PRINT_TO_STDOUT=0
+export ASCEND_GLOBAL_LOG_LEVEL=3
+export ASCEND_GLOBAL_EVENT_ENABLE=0
+export TASK_QUEUE_ENABLE=1
+export HCCL_WHITELIST_DISABLE=1
+# Run "msnpureport -g error" for devices 0-7 (presumably sets the device log level to error -- confirm).
+msnpureport -g error -d 0
+msnpureport -g error -d 1
+msnpureport -g error -d 2
+msnpureport -g error -d 3
+msnpureport -g error -d 4
+msnpureport -g error -d 5
+msnpureport -g error -d 6
+msnpureport -g error -d 7
+
+msnpureport -e disable
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_full_8p.sh
new file mode 100644
index 0000000000..d7e8add299
--- /dev/null
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_full_8p.sh
@@ -0,0 +1,136 @@
+#!/bin/bash
+# Full training run: exports RANK_SIZE=8 and launches train.py with the settings below.
+Network="Deltalm_for_PyTorch"
+export RANK_SIZE=8
+export MASTER_ADDR=localhost
+export MASTER_PORT=29688
+data_path=""
+
+# Number of training epochs (passed to --max-epoch)
+train_epochs=100
+# Token budget per batch (passed to --max-tokens); review and adjust per model
+token_size=1024
+
+# Argument parsing: --data_path is mandatory; --epochs and --batch_size override the defaults defined above.
+for para in "$@"
+do
+    if [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    elif [[ $para == --epochs* ]];then
+        train_epochs=${para#*=}  # feeds --max-epoch in the training command
+    elif [[ $para == --batch_size* ]];then
+        token_size=${para#*=}  # feeds --max-tokens in the training command
+    fi
+done
+
+# Abort early when the mandatory data_path argument is missing.
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+############### Set the directory the training script runs from ###############
+# If started inside the test folder, cd to its parent; otherwise assume the current directory contains test/. test_path_dir is the test folder.
+cur_path=`pwd`
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ]; then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=`pwd`
+else
+    test_path_dir=${cur_path}/test
+fi
+save_dir=${cur_path}/checkpoint
+
+# Create the per-device output directory (no need to modify)
+ASCEND_DEVICE_ID=0
+if [ -d ${test_path_dir}/output/$ASCEND_DEVICE_ID ];then
+    rm -rf ${test_path_dir}/output/*  # discard everything left by a previous run
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+fi
+
+
+# Training start time (no need to modify)
+start_time=$(date +%s)
+# Source the NPU environment script unless the etp_running_flag environment variable is "true"
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+fi
+# Launch training in the background; stdout and stderr go to the per-device log file
+nohup python train.py $data_path \
+    --save-dir $save_dir \
+    --arch deltalm_base \
+    --pretrained-deltalm-checkpoint $data_path/deltalm-base.pt \
+    --share-all-embeddings \
+    --max-source-positions 512 --max-target-positions 512 \
+    --criterion label_smoothed_cross_entropy \
+    --label-smoothing 0.1 \
+    --optimizer adam --adam-betas '(0.9, 0.98)' \
+    --lr-scheduler inverse_sqrt \
+    --lr 1e-4 \
+    --warmup-init-lr 1e-07 \
+    --stop-min-lr 1e-09 \
+    --warmup-updates 4000 \
+    --max-update 400000 \
+    --max-epoch $train_epochs \
+    --max-tokens $token_size \
+    --update-freq 1 \
+    --seed 1 \
+    --log-format simple \
+    --skip-invalid-size-inputs-valid-test \
+    --fp16 \
+    --keep-last-epochs 2 \
+    --eval-bleu \
+    --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
+    --eval-bleu-detok moses \
+    --eval-bleu-remove-bpe=sentencepiece \
+    --best-checkpoint-metric bleu --maximize-best-checkpoint-metric > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+# Block until the background training process has exited
+wait
+# Training end time (no need to modify)
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print the results (no need to modify)
+echo "------------------ Final result ------------------"
+# Extract throughput (wps) and train_wall figures from the train_inner log lines; review and adjust per model
+WPS=`grep 'train_inner ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "wps=" '{print $NF}'|awk -F "wps" '{print $1}'|awk -F "," '{print $1}'|awk 'END {print}'`
+train_wall=`grep 'train_inner ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "train_wall=" '{print $NF}'|awk 'NR==1{min=$1;next}{min=min<$1?min:$1}END{print min}'`
+TRAIN_WALL=`grep 'train_inner ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "train_wall=" '{print $NF}'|awk -F "," '{print $1}'|tail -n  20|awk '{sum+=$1} END {print"",sum/NR}'|sed s/[[:space:]]//g`
+
+echo "Final Performance words/sec : $WPS"
+echo "train_wall : $TRAIN_WALL"
+
+train_accuracy=`grep 'valid ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F "best_bleu" '{print $NF}' | awk 'END {print}' | sed s/[[:space:]]//g`
+# Print the accuracy information
+echo "Final Train Accuracy : ${train_accuracy}"
+echo "E2E Training Duration sec : $e2e_time"
+
+# Result summary for performance monitoring
+# Test-case information (no need to modify)
+TokenSize=${token_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${TokenSize}_${RANK_SIZE}'p'_'perf'  # NOTE(review): suffix is 'perf' although this is the full-training script -- confirm intended
+# Throughput
+ActualWPS=${WPS}
+## Per-iteration training time (taken from the minimum train_wall value computed above)
+TrainingTime=${train_wall}
+# Extract the loss values from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep -r "loss=" ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $13}' > ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+# Write the key information to ${CaseName}.log (no need to modify)
+echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TokenSize = ${TokenSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualWPS = ${ActualWPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainAccuracy = ${train_accuracy}">> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_performance_1p.sh
new file mode 100644
index 0000000000..44830be322
--- /dev/null
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_performance_1p.sh
@@ -0,0 +1,136 @@
+#!/bin/bash
+# Performance run: exports RANK_SIZE=1 and launches train.py with the settings below.
+Network="Deltalm_for_PyTorch"
+export RANK_SIZE=1
+export MASTER_ADDR=localhost
+export MASTER_PORT=29688
+data_path=""
+
+# Number of training epochs (passed to --max-epoch)
+train_epochs=1
+# Token budget per batch (passed to --max-tokens); review and adjust per model
+token_size=1024
+
+# Argument parsing: --data_path is mandatory; --epochs and --batch_size override the defaults defined above.
+for para in "$@"
+do
+    if [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    elif [[ $para == --epochs* ]];then
+        train_epochs=${para#*=}  # feeds --max-epoch in the training command
+    elif [[ $para == --batch_size* ]];then
+        token_size=${para#*=}  # feeds --max-tokens in the training command
+    fi
+done
+
+# Abort early when the mandatory data_path argument is missing.
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+############### Set the directory the training script runs from ###############
+# If started inside the test folder, cd to its parent; otherwise assume the current directory contains test/. test_path_dir is the test folder.
+cur_path=`pwd`
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ]; then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=`pwd`
+else
+    test_path_dir=${cur_path}/test
+fi
+save_dir=${cur_path}/checkpoint
+
+# Create the per-device output directory (no need to modify)
+ASCEND_DEVICE_ID=0
+if [ -d ${test_path_dir}/output/$ASCEND_DEVICE_ID ];then
+    rm -rf ${test_path_dir}/output/*  # discard everything left by a previous run
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+fi
+
+
+# Training start time (no need to modify)
+start_time=$(date +%s)
+# Source the NPU environment script unless the etp_running_flag environment variable is "true"
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+fi
+# Launch training in the background; stdout and stderr go to the per-device log file
+nohup python train.py $data_path \
+    --save-dir $save_dir \
+    --arch deltalm_base \
+    --pretrained-deltalm-checkpoint $data_path/deltalm-base.pt \
+    --share-all-embeddings \
+    --max-source-positions 512 --max-target-positions 512 \
+    --criterion label_smoothed_cross_entropy \
+    --distributed-world-size 1 \
+    --distributed-num-procs 1 \
+    --distributed-no-spawn \
+    --distributed-backend hccl \
+    --label-smoothing 0.1 \
+    --optimizer adam --adam-betas '(0.9, 0.98)' \
+    --lr-scheduler inverse_sqrt \
+    --lr 1e-4 \
+    --warmup-init-lr 1e-07 \
+    --stop-min-lr 1e-09 \
+    --warmup-updates 4000 \
+    --max-update 400000 \
+    --max-epoch $train_epochs \
+    --max-tokens $token_size \
+    --update-freq 1 \
+    --seed 1 \
+    --log-format simple \
+    --skip-invalid-size-inputs-valid-test \
+    --fp16 \
+    --eval-bleu \
+    --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
+    --eval-bleu-detok moses \
+    --eval-bleu-remove-bpe=sentencepiece \
+    --best-checkpoint-metric bleu --maximize-best-checkpoint-metric > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+# Block until the background training process has exited
+wait
+# Training end time (no need to modify)
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print the results (no need to modify)
+echo "------------------ Final result ------------------"
+# Extract throughput (wps) and train_wall figures from the train_inner log lines; review and adjust per model
+WPS=`grep 'train_inner ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "wps=" '{print $NF}'|awk -F "wps" '{print $1}'|awk -F "," '{print $1}'|awk 'END {print}'`
+train_wall=`grep 'train_inner ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "train_wall=" '{print $NF}'|awk 'NR==1{min=$1;next}{min=min<$1?min:$1}END{print min}'`
+TRAIN_WALL=`grep 'train_inner ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "train_wall=" '{print $NF}'|awk -F "," '{print $1}'|tail -n  20|awk '{sum+=$1} END {print"",sum/NR}'|sed s/[[:space:]]//g`
+
+echo "Final Performance words/sec : $WPS"
+echo "train_wall : $TRAIN_WALL"
+
+echo "E2E Training Duration sec : $e2e_time"
+
+
+# Result summary for performance monitoring
+# Test-case information (no need to modify)
+TokenSize=${token_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${TokenSize}_${RANK_SIZE}'p'_'perf'
+# Throughput
+ActualWPS=${WPS}
+## Per-iteration training time (taken from the minimum train_wall value computed above)
+TrainingTime=${train_wall}
+# Extract the loss values from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep -r "loss=" ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $13}' > ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+# Write the key information to ${CaseName}.log (no need to modify)
+echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TokenSize = ${TokenSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualWPS = ${ActualWPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_performance_8p.sh b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_performance_8p.sh
new file mode 100644
index 0000000000..3320257e41
--- /dev/null
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_performance_8p.sh
@@ -0,0 +1,132 @@
+#!/bin/bash
+# Performance run: exports RANK_SIZE=8 and launches train.py with the settings below.
+Network="Deltalm_for_PyTorch"
+export RANK_SIZE=8
+export MASTER_ADDR=localhost
+export MASTER_PORT=29688
+data_path=""
+
+# Number of training epochs (passed to --max-epoch)
+train_epochs=1
+# Token budget per batch (passed to --max-tokens); review and adjust per model
+token_size=1024
+
+# Argument parsing: --data_path is mandatory; --epochs and --batch_size override the defaults defined above.
+for para in "$@"
+do
+    if [[ $para == --data_path* ]];then
+        data_path=${para#*=}
+    elif [[ $para == --epochs* ]];then
+        train_epochs=${para#*=}  # feeds --max-epoch in the training command
+    elif [[ $para == --batch_size* ]];then
+        token_size=${para#*=}  # feeds --max-tokens in the training command
+    fi
+done
+
+# Abort early when the mandatory data_path argument is missing.
+if [[ $data_path == "" ]];then
+    echo "[Error] para \"data_path\" must be configured"
+    exit 1
+fi
+
+############### Set the directory the training script runs from ###############
+# If started inside the test folder, cd to its parent; otherwise assume the current directory contains test/. test_path_dir is the test folder.
+cur_path=`pwd`
+cur_path_last_dirname=${cur_path##*/}
+if [ x"${cur_path_last_dirname}" == x"test" ]; then
+    test_path_dir=${cur_path}
+    cd ..
+    cur_path=`pwd`
+else
+    test_path_dir=${cur_path}/test
+fi
+save_dir=${cur_path}/checkpoint
+
+# Create the per-device output directory (no need to modify)
+ASCEND_DEVICE_ID=0
+if [ -d ${test_path_dir}/output/$ASCEND_DEVICE_ID ];then
+    rm -rf ${test_path_dir}/output/*  # discard everything left by a previous run
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+else
+    mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID
+fi
+
+
+# Training start time (no need to modify)
+start_time=$(date +%s)
+# Source the NPU environment script unless the etp_running_flag environment variable is "true"
+check_etp_flag=`env | grep etp_running_flag`
+etp_flag=`echo ${check_etp_flag#*=}`
+if [ x"${etp_flag}" != x"true" ];then
+    source ${test_path_dir}/env_npu.sh
+fi
+# Launch training in the background; stdout and stderr go to the per-device log file
+nohup python train.py $data_path \
+    --save-dir $save_dir \
+    --arch deltalm_base \
+    --pretrained-deltalm-checkpoint $data_path/deltalm-base.pt \
+    --share-all-embeddings \
+    --max-source-positions 512 --max-target-positions 512 \
+    --criterion label_smoothed_cross_entropy \
+    --label-smoothing 0.1 \
+    --optimizer adam --adam-betas '(0.9, 0.98)' \
+    --lr-scheduler inverse_sqrt \
+    --lr 1e-4 \
+    --warmup-init-lr 1e-07 \
+    --stop-min-lr 1e-09 \
+    --warmup-updates 4000 \
+    --max-update 400000 \
+    --max-epoch $train_epochs \
+    --max-tokens $token_size \
+    --update-freq 1 \
+    --seed 1 \
+    --log-format simple \
+    --skip-invalid-size-inputs-valid-test \
+    --fp16 \
+    --eval-bleu \
+    --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \
+    --eval-bleu-detok moses \
+    --eval-bleu-remove-bpe=sentencepiece \
+    --best-checkpoint-metric bleu --maximize-best-checkpoint-metric > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 &
+# Block until the background training process has exited
+wait
+# Training end time (no need to modify)
+end_time=$(date +%s)
+e2e_time=$(( $end_time - $start_time ))
+
+# Print the results (no need to modify)
+echo "------------------ Final result ------------------"
+# Extract throughput (wps) and train_wall figures from the train_inner log lines; review and adjust per model
+WPS=`grep 'train_inner ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "wps=" '{print $NF}'|awk -F "wps" '{print $1}'|awk -F "," '{print $1}'|awk 'END {print}'`
+train_wall=`grep 'train_inner ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "train_wall=" '{print $NF}'|awk 'NR==1{min=$1;next}{min=min<$1?min:$1}END{print min}'`
+TRAIN_WALL=`grep 'train_inner ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "train_wall=" '{print $NF}'|awk -F "," '{print $1}'|tail -n  20|awk '{sum+=$1} END {print"",sum/NR}'|sed s/[[:space:]]//g`
+
+echo "Final Performance words/sec : $WPS"
+echo "train_wall : $TRAIN_WALL"
+
+echo "E2E Training Duration sec : $e2e_time"
+
+
+# Result summary for performance monitoring
+# Test-case information (no need to modify)
+TokenSize=${token_size}
+DeviceType=`uname -m`
+CaseName=${Network}_bs${TokenSize}_${RANK_SIZE}'p'_'perf'
+# Throughput
+ActualWPS=${WPS}
+## Per-iteration training time (taken from the minimum train_wall value computed above)
+TrainingTime=${train_wall}
+# Extract the loss values from train_$ASCEND_DEVICE_ID.log into train_${CaseName}_loss.txt; review per model
+grep -r "loss=" ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $13}' > ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt
+
+ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt`
+# Write the key information to ${CaseName}.log (no need to modify)
+echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TokenSize = ${TokenSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualWPS = ${ActualWPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
+echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/train.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/train.py
index 6ee12117d6..53f768dd78 100644
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/train.py
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/train.py
@@ -1,7 +1,40 @@
+# BSD 3-Clause License
+# Copyright (c) 2017 xxxx
+# All rights reserved.
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
+import torch
+import torch_npu  # not referenced below; presumably imported for its side effect of enabling the NPU backend -- confirm
+from torch_npu.contrib import transfer_to_npu  # not referenced below; presumably redirects CUDA calls to NPU on import -- confirm
+from fairseq import fairseq  # NOTE(review): not referenced in this file; confirm this import is required
 import deltalm
-
 from fairseq_cli.train import cli_main
 
 
 if __name__ == "__main__":
-    cli_main()
\ No newline at end of file
+    cli_main()
-- 
Gitee


From b622681cbcc74e7263d0e914e44f1ff65801c4d4 Mon Sep 17 00:00:00 2001
From: wuxiankun <wuxiankun@huawei.com>
Date: Thu, 6 Jul 2023 13:58:59 +0800
Subject: [PATCH 2/2] add license for modified files

---
 .../nlp/Deltalm_for_PyTorch/auto-data.sh      | 30 ++++++++++++
 .../examples/prepare_iwslt14.sh               |  6 ++-
 .../fairseq/fairseq/modules/layer_norm.py     | 46 +++++++++++++++----
 .../fairseq/optim/dynamic_loss_scaler.py      | 30 ++++++++++++
 .../fairseq/fairseq/sequence_generator.py     | 30 ++++++++++++
 .../fairseq/fairseq/tasks/fairseq_task.py     | 30 ++++++++++++
 .../fairseq/fairseq/trainer.py                | 30 ++++++++++++
 .../fairseq/fairseq/utils.py                  | 30 ++++++++++++
 .../fairseq/fairseq_cli/train.py              | 30 ++++++++++++
 9 files changed, 252 insertions(+), 10 deletions(-)
 create mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/auto-data.sh

diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/auto-data.sh b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/auto-data.sh
new file mode 100644
index 0000000000..7ad4c0d168
--- /dev/null
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/auto-data.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Usage: auto-data.sh <raw_data_dir> <data_bin_dir> <spm_model> <dict_file> <moses_repo_url> <iwslt14_archive_url>
+oridata_path=$1
+data_bin=$2
+spm_path=$3
+dict_path=$4
+mose_http=$5
+iws_url=$6
+bash examples/prepare_iwslt14.sh "$oridata_path" "$mose_http" "$iws_url" || exit 1  # later steps need its output
+wait
+# Encode every split with the SentencePiece model; the encoded files are written to the current directory.
+orill_dir=$oridata_path/iwslt14.tokenized.de-en
+spm_encode --model="$spm_path" --output_format=piece < "$orill_dir/train.de" > train.spm.src
+spm_encode --model="$spm_path" --output_format=piece < "$orill_dir/train.en" > train.spm.tgt
+spm_encode --model="$spm_path" --output_format=piece < "$orill_dir/valid.de" > valid.spm.src
+spm_encode --model="$spm_path" --output_format=piece < "$orill_dir/valid.en" > valid.spm.tgt
+spm_encode --model="$spm_path" --output_format=piece < "$orill_dir/test.de" > test.spm.src
+spm_encode --model="$spm_path" --output_format=piece < "$orill_dir/test.en" > test.spm.tgt
+
+wait
+python preprocess.py  \
+    --trainpref train.spm \
+    --validpref valid.spm \
+    --testpref test.spm \
+    --source-lang src --target-lang tgt \
+    --destdir "$data_bin" \
+    --srcdict "$dict_path" \
+    --tgtdict "$dict_path" \
+    --workers 40
+
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/examples/prepare_iwslt14.sh b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/examples/prepare_iwslt14.sh
index ab1437d2fb..d745b30b2a 100644
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/examples/prepare_iwslt14.sh
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/examples/prepare_iwslt14.sh
@@ -6,15 +6,17 @@ mkdir -p $1
 
 cd $1
 
+mose_http=$2
+iws_url=$3
 echo 'Cloning Moses github repository (for tokenization scripts)...'
-git clone https://github.com/moses-smt/mosesdecoder.git
+git clone $mose_http
 
 SCRIPTS=mosesdecoder/scripts
 TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
 LC=$SCRIPTS/tokenizer/lowercase.perl
 CLEAN=$SCRIPTS/training/clean-corpus-n.perl
 
-URL="http://dl.fbaipublicfiles.com/fairseq/data/iwslt14/de-en.tgz"
+URL=$iws_url
 GZ=de-en.tgz
 
 if [ ! -d "$SCRIPTS" ]; then
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/layer_norm.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/layer_norm.py
index 234609d9e2..85b5decef4 100644
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/layer_norm.py
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/layer_norm.py
@@ -1,3 +1,33 @@
+# BSD 3-Clause License
+# Copyright (c) 2017 xxxx
+# All rights reserved.
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
 # Copyright (c) Facebook, Inc. and its affiliates.
 #
 # This source code is licensed under the MIT license found in the
@@ -27,14 +57,6 @@ except ImportError:
     has_fused_layernorm = False
 
 
-def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False):
-    if torch.jit.is_scripting():
-        export = True
-    if not export and torch.cuda.is_available() and has_fused_layernorm:
-        return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
-    return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
-
-
 class Fp32LayerNorm(nn.LayerNorm):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -48,3 +70,11 @@ class Fp32LayerNorm(nn.LayerNorm):
             self.eps,
         )
         return output.type_as(input)
+
+
+def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False):  # factory function, not a class
+    if torch.jit.is_scripting():
+        export = True  # under TorchScript scripting, behave as if export had been requested
+    if not export and torch.cuda.is_available() and has_fused_layernorm:
+        return FusedLayerNorm(normalized_shape, eps, elementwise_affine)  # only when the fused import succeeded
+    return Fp32LayerNorm(normalized_shape, eps, elementwise_affine)  # fallback is Fp32LayerNorm (defined above)
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/optim/dynamic_loss_scaler.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/optim/dynamic_loss_scaler.py
index 743e30a675..b4ef114175 100644
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/optim/dynamic_loss_scaler.py
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/optim/dynamic_loss_scaler.py
@@ -1,3 +1,33 @@
+# BSD 3-Clause License
+# Copyright (c) 2017 xxxx
+# All rights reserved.
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
 # Copyright (c) Facebook, Inc. and its affiliates.
 #
 # This source code is licensed under the MIT license found in the
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/sequence_generator.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/sequence_generator.py
index 83c7b0e216..5615f15240 100644
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/sequence_generator.py
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/sequence_generator.py
@@ -1,3 +1,33 @@
+# BSD 3-Clause License
+# Copyright (c) 2017 xxxx
+# All rights reserved.
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
 # Copyright (c) Facebook, Inc. and its affiliates.
 #
 # This source code is licensed under the MIT license found in the
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/tasks/fairseq_task.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/tasks/fairseq_task.py
index a097e75148..dacbba4774 100644
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/tasks/fairseq_task.py
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/tasks/fairseq_task.py
@@ -1,3 +1,33 @@
+# BSD 3-Clause License
+# Copyright (c) 2017 xxxx
+# All rights reserved.
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
 # Copyright (c) Facebook, Inc. and its affiliates.
 #
 # This source code is licensed under the MIT license found in the
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/trainer.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/trainer.py
index 5424171996..f5b0fbd72c 100644
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/trainer.py
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/trainer.py
@@ -1,3 +1,33 @@
+# BSD 3-Clause License
+# Copyright (c) 2017 xxxx
+# All rights reserved.
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
 # Copyright (c) Facebook, Inc. and its affiliates.
 #
 # This source code is licensed under the MIT license found in the
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/utils.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/utils.py
index 96cfce1085..6110417982 100644
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/utils.py
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/utils.py
@@ -1,3 +1,33 @@
+# BSD 3-Clause License
+# Copyright (c) 2017 xxxx
+# All rights reserved.
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
 # Copyright (c) Facebook, Inc. and its affiliates.
 #
 # This source code is licensed under the MIT license found in the
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq_cli/train.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq_cli/train.py
index 8b93429c23..b79986e062 100644
--- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq_cli/train.py
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq_cli/train.py
@@ -1,4 +1,34 @@
 #!/usr/bin/env python3 -u
+# BSD 3-Clause License
+# Copyright (c) 2017 xxxx
+# All rights reserved.
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# ============================================================================
 # Copyright (c) Facebook, Inc. and its affiliates.
 #
 # This source code is licensed under the MIT license found in the
-- 
Gitee