From 0d6cb73036fb404c85e4a545ab3876bbccd65441 Mon Sep 17 00:00:00 2001 From: wuxiankun Date: Wed, 5 Jul 2023 13:32:50 +0800 Subject: [PATCH 1/2] add npu adapt code --- .../built-in/nlp/Deltalm_for_PyTorch/LICENSE | 30 ++ .../nlp/Deltalm_for_PyTorch/README.md | 342 +++++++-------- .../nlp/Deltalm_for_PyTorch/README_RAW.md | 184 ++++++++ .../cuda/ngram_repeat_block_cuda_kernel.cu | 82 ---- .../fairseq/clib/libnat_cuda/edit_dist.cu | 344 --------------- .../fairseq/fairseq/modules/cuda_utils.cu | 202 --------- .../dynamicconv_cuda_kernel.cu | 176 -------- .../lightconv_layer/lightconv_cuda_kernel.cu | 400 ------------------ .../fairseq/optim/dynamic_loss_scaler.py | 15 +- .../fairseq/fairseq/sequence_generator.py | 2 +- .../fairseq/fairseq/tasks/fairseq_task.py | 1 + .../fairseq/fairseq/trainer.py | 7 +- .../fairseq/fairseq/utils.py | 3 - .../fairseq/fairseq_cli/train.py | 3 +- .../nlp/Deltalm_for_PyTorch/requirements.txt | 3 + .../nlp/Deltalm_for_PyTorch/test/env_npu.sh | 31 ++ .../Deltalm_for_PyTorch/test/train_full_8p.sh | 136 ++++++ .../test/train_performance_1p.sh | 136 ++++++ .../test/train_performance_8p.sh | 132 ++++++ .../built-in/nlp/Deltalm_for_PyTorch/train.py | 37 +- 20 files changed, 864 insertions(+), 1402 deletions(-) create mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/LICENSE create mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/README_RAW.md delete mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/clib/cuda/ngram_repeat_block_cuda_kernel.cu delete mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/clib/libnat_cuda/edit_dist.cu delete mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/cuda_utils.cu delete mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu delete mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu create mode 100644 
PyTorch/built-in/nlp/Deltalm_for_PyTorch/requirements.txt create mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/env_npu.sh create mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_full_8p.sh create mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_performance_1p.sh create mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_performance_8p.sh diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/LICENSE b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/LICENSE new file mode 100644 index 0000000000..92a2f682ce --- /dev/null +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/LICENSE @@ -0,0 +1,30 @@ +BSD 3-Clause License + +Copyright (c) 2023, +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/README.md b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/README.md index a90733ad4d..a824d178e6 100644 --- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/README.md +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/README.md @@ -1,184 +1,160 @@ -# [DeltaLM](https://arxiv.org/abs/2106.13736) +# Deltalm for PyTorch -**Encoder-Decoder Pre-training for Language Generation and Translation** - -[DeltaLM: Encoder-Decoder Pre-training for Language Generation and Translation by Augmenting Pretrained Multilingual Encoders.](https://arxiv.org/abs/2106.13736) Shuming Ma, Li Dong, Shaohan Huang, Dongdong Zhang, Alexandre Muzio, Saksham Singhal, Hany Hassan Awadalla, Xia Song, Furu Wei. CoRR abs/2106.13736. - -[mT6: Multilingual Pretrained Text-to-Text Transformer with Translation Pairs.](https://arxiv.org/abs/2104.08692) Zewen Chi, Li Dong, Shuming Ma, Shaohan Huang, Xian-Ling Mao, Heyan Huang, and Furu Wei. In EMNLP 2021. - -- September 2021: DeltaLM ranks first on the [WMT21 multilingual translation task](http://www.statmt.org/wmt21/large-scale-multilingual-translation-task.html). -- August 2021: release code and pretrained checkpoints. 
- ---- - -## Pretrained Models - -- [DeltaLM-base](https://deltalm.blob.core.windows.net/deltalm/deltalm-base.pt): #enc-dec=12-6; #hidden=768; #head=12; #FFN=3072 (#parameters: 360M) -- [DeltaLM-large](https://deltalm.blob.core.windows.net/deltalm/deltalm-large.pt): #enc-dec=24-12; #hidden=1024; #head=16; #FFN=4096 (#parameters: 830M) -- [Vocabulary](https://deltalm.blob.core.windows.net/deltalm/dict.txt) and [Sentencepiece-model](https://deltalm.blob.core.windows.net/deltalm/spm.model) -- DeltaLM can be finetuned to support language generation and translation tasks for **100+ languages** - - -## Cross-lingual Abstractive Summarization - [Wikilingua](https://arxiv.org/abs/2010.03093) - -We evaluate DeltaLM on cross-lingual abstractive summarization benchmark. We report the results by averaging the numbers in different languages. - -| Model | #Params | ROUGE-1 | ROUGE-2 | ROUGE-L | -|-----------|-------------|-----------|-----------|-----------| -| [mBART](https://arxiv.org/abs/2001.08210) | 610M | 34.5 | 12.9 | **28.7** | -| [mT5](https://arxiv.org/abs/2010.11934) | 300M | 27.5 | 8.8 | 22.8 | -| [mT5](https://arxiv.org/abs/2010.11934) | 580M | 31.8 | 11.5 | 26.0 | -| DeltaLM | 360M | **35.3** | **13.4** | **28.7** | - - -## Setup - -```bash -git submodule update --init deltalm/fairseq -cd deltalm/ -pip install --editable fairseq/ -``` - -## Fine-tuning - -1. Organize the raw data in the following structure: -``` -. -+-- /path/to/data/ -| +-- train.src -| +-- train.tgt -| +-- valid.src -| +-- valid.tgt -``` - -*Examples (IWSLT14 German to English)*: -```bash -bash examples/prepare_iwslt14.sh /tmp/iwslt14 -``` - -2. 
Tokenize the data using [Sentencepiece](https://github.com/google/sentencepiece): - -```bash -spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < train.src > train.spm.src -spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < train.tgt > train.spm.tgt -spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < valid.src > valid.spm.src -spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < valid.tgt > valid.spm.tgt -spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < test.src > test.spm.src -spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < test.tgt > test.spm.tgt -``` - -*Examples (IWSLT14 German to English)*: -```bash -bash examples/binary_iwslt14.sh \ - /tmp/iwslt14/iwslt14.tokenized.de-en \ - /tmp/iwslt14/iwslt14.spm \ - /path/to/checkpoint/spm.model -``` - -3. Binary the data: - -```bash -data_bin=/path/to/data-bin/ -python preprocess.py \ - --trainpref train.spm \ - --validpref valid.spm \ - --testpref test.spm \ - --source-lang src --target-lang tgt \ - --destdir $data_bin \ - --srcdict /path/to/checkpoint/dict.txt \ - --tgtdict /path/to/checkpoint/dict.txt \ - --workers 40 -``` - -*Examples (IWSLT14 German to English)*: -```bash -bash examples/binary_iwslt14.sh \ - /tmp/iwslt14/iwslt14.spm \ - /tmp/iwslt14/iwslt14.bin \ - /path/to/checkpoint/dict.txt -``` - -4. 
Fine-tuning: - -```bash -PRETRAINED_MODEL=/path/to/checkpoint/model.pt -python train.py $data_bin \ - --save-dir $save_dir \ - --arch deltalm_base \ - --pretrained-deltalm-checkpoint $PRETRAINED_MODEL \ - --share-all-embeddings \ - --max-source-positions 512 --max-target-positions 512 \ - --criterion label_smoothed_cross_entropy \ - --label-smoothing 0.1 \ - --optimizer adam --adam-betas '(0.9, 0.98)' \ - --lr-scheduler inverse_sqrt \ - --lr $lr \ - --warmup-init-lr 1e-07 \ - --stop-min-lr 1e-09 \ - --warmup-updates 4000 \ - --max-update 400000 \ - --max-epoch 100 \ - --max-tokens $batch_size \ - --update-freq 1 \ - --seed 1 \ - --log-format simple \ - --skip-invalid-size-inputs-valid-test -``` -**Note: -- For large checkpoint, please set `--arch deltalm_large`. -- Please adjust the `max-tokens` and `update-freq` to suit in different experimental environments. Recommendation of the total batch size is `4096 * 128` tokens per step. -- Use `--fp16` for more efficient training on the devices that have Tensor Cores. - -*Examples (IWSLT14 German to English)*: -```bash -bash examples/train_iwslt14.sh \ - /tmp/iwslt14/iwslt14.bin \ - /tmp/iwslt14/checkpoints \ - /path/to/checkpoint/model.pt -``` - -5. 
Evaluation: - -```bash -python generate.py $data_bin \ - --path $save_dir/checkpoint_best.pt \ - --batch-size 128 --beam 5 --remove-bpe=sentencepiece -``` - -*Examples (IWSLT14 German to English)*: -```bash -bash examples/evaluate_iwslt14.sh \ - /tmp/iwslt14/iwslt14.bin \ - /tmp/iwslt14/checkpoints -``` - ---- - -## Citation - -If you find this repository useful, please consider citing our work: -``` -@article{deltalm, - title={{DeltaLM}: Encoder-Decoder Pre-training for Language Generation and Translation by Augmenting Pretrained Multilingual Encoders}, - author={Shuming Ma and Li Dong and Shaohan Huang and Dongdong Zhang and Alexandre Muzio and Saksham Singhal and Hany Hassan Awadalla and Xia Song and Furu Wei}, - year={2021}, - eprint={2106.13736}, - archivePrefix={arXiv}, - primaryClass={cs.CL} -} -``` - -## Acknowledgement - -This repository is built using the [Fairseq](https://github.com/pytorch/fairseq) repository. - -## License -This project is licensed under the license found in the LICENSE file in the root directory of this source tree. - -[Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct) - -### Contact Information - -For help or issues using DeltaLM models, please submit a GitHub issue. - -For other communications related to DeltaLM, please contact Shuming Ma (`shumma@microsoft.com`), [Furu Wei](http://gitnlp.org/) (`fuwei@microsoft.com`). 
+- [概述](#概述) + +- [准备训练环境](#准备训练环境) + +- [开始训练](#开始训练) + +- [训练结果展示](#训练结果展示) + +- [版本说明](#版本说明) + + +# 概述 + +## 简述 + +Deltalm 模型是Fairseq套件中基于Transformer结构的翻译模型,在iwslt14 de2en数据集上训练和评估。 + +- 参考实现: + + ``` + url=https://github.com/microsoft/unilm/blob/master/deltalm + commit_id=eb1cc35e63988b2fe8c1fae348012a57da096e43 + ``` + +- 适配昇腾 AI 处理器的实现: + + ``` + url=https://gitee.com/ascend/ModelZoo-PyTorch.git + code_path=PyTorch/built-in/nlp + ``` + + +# 准备训练环境 + +## 准备环境 + +- 当前模型支持的 PyTorch 版本和已知三方库依赖如下表所示。 + + **表 1** 版本支持表 + + | Torch_Version | 三方库依赖版本 | + | :--------: | :----------------------------------------------------------: | + | PyTorch 1.8 | - | + +- 环境准备指导。 + + 请参考《[Pytorch框架训练环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes)》。 + +- 安装套件。 + + 在模型源码包根目录下执行以下命令。 + ```bash + pip3.7 install -e ./fairseq + ``` + 安装相应的依赖库。 + ``` + pip install -r requirements.txt + ``` + + +## 准备数据集 + +1. 获取数据集。 + + 1. 用户可参考源码GPU仓自行下载 `iwslt14` 数据集,并在预处理数据后,上传至服务器任意目录中,如`/data-bin`。 + 2. 或者使用一键式处理工具`auto-data.sh`,需提前准备: + 1. tokenize模型:"https://deltalm.blob.core.windows.net/deltalm/spm.model" + 2. 准备数据词典:"https://deltalm.blob.core.windows.net/deltalm/dict.txt" + 3. 准备分词工具:参考"https://github.com/google/sentencepiece"的README安装`spm_encode` + 4. 执行脚本`bash auto-data.sh $1 $2 $3 $4 $5 $6` + + $1:原始数据生成目录 `/tmp/iwslt14` + + $2:最终处理数据目录 `/data-bin` + + $3:tokenize模型路径 + + $4:词典路径 + + $5:数据预处理工具下载链接:[mosesdecoder](https://github.com/moses-smt/mosesdecoder.git) + + $6:原始数据下载链接:[iwslt14](http://dl.fbaipublicfiles.com/fairseq/data/iwslt14/de-en.tgz) + +2. 获取预训练模型。 + 用户自行下载`deltalm-base`预训练模型权重,并放置于上述预处理后的数据目录下。 +# 开始训练 + +## 训练模型 + +1. 进入源码包根目录。 + + ```bash + cd /${模型文件夹名称} + ``` + +2. 
运行训练脚本。 + + 该模型支持单机单卡训练和单机8卡训练。 + + - 单机单卡训练。 + + 启动单卡训练。 + + ```bash + bash ./test/train_performance_1p.sh --data_path=/data-bin # 单卡性能 + ``` + + - 单机8卡训练。 + + 启动8卡训练。 + + ```bash + bash ./test/train_full_8p.sh --data_path=/data-bin # 8卡精度 + bash ./test/train_performance_8p.sh --data_path=/data-bin # 8卡性能 + ``` + + --data_path参数填写数据集路径,需写到数据集的一级目录。 + + + 模型训练脚本参数说明如下。 + + ``` + 公共参数: + --data_path //数据集路径 + --arch //使用模型架构 + --save-dir //权重文件保存路径 + --max-epoch //最大训练轮数 + --max-tokens //每个batch的最大token数 + --lr //学习率 + --optimizer //使用哪种优化器 + --eval-bleu //验证时计算BLEU指标 + --distributed-world-size //分布式训练使用的设备总数 + ``` + + 训练完成后,权重文件默认保存在当前路径的checkpoints目录下,并在test/out目录下输出模型训练精度和性能信息。 + +# 训练结果展示 + +**表 2** iwslt14 de2en数据集训练结果展示表 + +| NAME | MODE | Bleu | WPS | Epochs | AMP_Type | Torch_Version | +| :---: |------|:-----:|:----:| :---: | :---: | :---: | +| 8p-竞品A | fp16 | 39.45 | 14401 | 100 | - | 1.8 | +| 8p-NPU | fp16 | 39.37 | 16214 | 100 | - | 1.8 | + +> **说明:** + >由于该模型默认开启二进制,所以在性能测试时,需要安装二进制包,安装方式参考《[Pytorch框架训练环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes)》。 + + +# 版本说明 + +## 变更 + +2023.6.29:首次发布。 + +## FAQ + +无。 diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/README_RAW.md b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/README_RAW.md new file mode 100644 index 0000000000..a90733ad4d --- /dev/null +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/README_RAW.md @@ -0,0 +1,184 @@ +# [DeltaLM](https://arxiv.org/abs/2106.13736) + +**Encoder-Decoder Pre-training for Language Generation and Translation** + +[DeltaLM: Encoder-Decoder Pre-training for Language Generation and Translation by Augmenting Pretrained Multilingual Encoders.](https://arxiv.org/abs/2106.13736) Shuming Ma, Li Dong, Shaohan Huang, Dongdong Zhang, Alexandre Muzio, Saksham Singhal, Hany Hassan Awadalla, Xia Song, Furu Wei. CoRR abs/2106.13736. 
+ +[mT6: Multilingual Pretrained Text-to-Text Transformer with Translation Pairs.](https://arxiv.org/abs/2104.08692) Zewen Chi, Li Dong, Shuming Ma, Shaohan Huang, Xian-Ling Mao, Heyan Huang, and Furu Wei. In EMNLP 2021. + +- September 2021: DeltaLM ranks first on the [WMT21 multilingual translation task](http://www.statmt.org/wmt21/large-scale-multilingual-translation-task.html). +- August 2021: release code and pretrained checkpoints. + +--- + +## Pretrained Models + +- [DeltaLM-base](https://deltalm.blob.core.windows.net/deltalm/deltalm-base.pt): #enc-dec=12-6; #hidden=768; #head=12; #FFN=3072 (#parameters: 360M) +- [DeltaLM-large](https://deltalm.blob.core.windows.net/deltalm/deltalm-large.pt): #enc-dec=24-12; #hidden=1024; #head=16; #FFN=4096 (#parameters: 830M) +- [Vocabulary](https://deltalm.blob.core.windows.net/deltalm/dict.txt) and [Sentencepiece-model](https://deltalm.blob.core.windows.net/deltalm/spm.model) +- DeltaLM can be finetuned to support language generation and translation tasks for **100+ languages** + + +## Cross-lingual Abstractive Summarization - [Wikilingua](https://arxiv.org/abs/2010.03093) + +We evaluate DeltaLM on cross-lingual abstractive summarization benchmark. We report the results by averaging the numbers in different languages. + +| Model | #Params | ROUGE-1 | ROUGE-2 | ROUGE-L | +|-----------|-------------|-----------|-----------|-----------| +| [mBART](https://arxiv.org/abs/2001.08210) | 610M | 34.5 | 12.9 | **28.7** | +| [mT5](https://arxiv.org/abs/2010.11934) | 300M | 27.5 | 8.8 | 22.8 | +| [mT5](https://arxiv.org/abs/2010.11934) | 580M | 31.8 | 11.5 | 26.0 | +| DeltaLM | 360M | **35.3** | **13.4** | **28.7** | + + +## Setup + +```bash +git submodule update --init deltalm/fairseq +cd deltalm/ +pip install --editable fairseq/ +``` + +## Fine-tuning + +1. Organize the raw data in the following structure: +``` +. 
++-- /path/to/data/ +| +-- train.src +| +-- train.tgt +| +-- valid.src +| +-- valid.tgt +``` + +*Examples (IWSLT14 German to English)*: +```bash +bash examples/prepare_iwslt14.sh /tmp/iwslt14 +``` + +2. Tokenize the data using [Sentencepiece](https://github.com/google/sentencepiece): + +```bash +spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < train.src > train.spm.src +spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < train.tgt > train.spm.tgt +spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < valid.src > valid.spm.src +spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < valid.tgt > valid.spm.tgt +spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < test.src > test.spm.src +spm_encode --model=/path/to/checkpoint/spm.model --output_format=piece < test.tgt > test.spm.tgt +``` + +*Examples (IWSLT14 German to English)*: +```bash +bash examples/binary_iwslt14.sh \ + /tmp/iwslt14/iwslt14.tokenized.de-en \ + /tmp/iwslt14/iwslt14.spm \ + /path/to/checkpoint/spm.model +``` + +3. Binary the data: + +```bash +data_bin=/path/to/data-bin/ +python preprocess.py \ + --trainpref train.spm \ + --validpref valid.spm \ + --testpref test.spm \ + --source-lang src --target-lang tgt \ + --destdir $data_bin \ + --srcdict /path/to/checkpoint/dict.txt \ + --tgtdict /path/to/checkpoint/dict.txt \ + --workers 40 +``` + +*Examples (IWSLT14 German to English)*: +```bash +bash examples/binary_iwslt14.sh \ + /tmp/iwslt14/iwslt14.spm \ + /tmp/iwslt14/iwslt14.bin \ + /path/to/checkpoint/dict.txt +``` + +4. 
Fine-tuning: + +```bash +PRETRAINED_MODEL=/path/to/checkpoint/model.pt +python train.py $data_bin \ + --save-dir $save_dir \ + --arch deltalm_base \ + --pretrained-deltalm-checkpoint $PRETRAINED_MODEL \ + --share-all-embeddings \ + --max-source-positions 512 --max-target-positions 512 \ + --criterion label_smoothed_cross_entropy \ + --label-smoothing 0.1 \ + --optimizer adam --adam-betas '(0.9, 0.98)' \ + --lr-scheduler inverse_sqrt \ + --lr $lr \ + --warmup-init-lr 1e-07 \ + --stop-min-lr 1e-09 \ + --warmup-updates 4000 \ + --max-update 400000 \ + --max-epoch 100 \ + --max-tokens $batch_size \ + --update-freq 1 \ + --seed 1 \ + --log-format simple \ + --skip-invalid-size-inputs-valid-test +``` +**Note: +- For large checkpoint, please set `--arch deltalm_large`. +- Please adjust the `max-tokens` and `update-freq` to suit in different experimental environments. Recommendation of the total batch size is `4096 * 128` tokens per step. +- Use `--fp16` for more efficient training on the devices that have Tensor Cores. + +*Examples (IWSLT14 German to English)*: +```bash +bash examples/train_iwslt14.sh \ + /tmp/iwslt14/iwslt14.bin \ + /tmp/iwslt14/checkpoints \ + /path/to/checkpoint/model.pt +``` + +5. 
Evaluation: + +```bash +python generate.py $data_bin \ + --path $save_dir/checkpoint_best.pt \ + --batch-size 128 --beam 5 --remove-bpe=sentencepiece +``` + +*Examples (IWSLT14 German to English)*: +```bash +bash examples/evaluate_iwslt14.sh \ + /tmp/iwslt14/iwslt14.bin \ + /tmp/iwslt14/checkpoints +``` + +--- + +## Citation + +If you find this repository useful, please consider citing our work: +``` +@article{deltalm, + title={{DeltaLM}: Encoder-Decoder Pre-training for Language Generation and Translation by Augmenting Pretrained Multilingual Encoders}, + author={Shuming Ma and Li Dong and Shaohan Huang and Dongdong Zhang and Alexandre Muzio and Saksham Singhal and Hany Hassan Awadalla and Xia Song and Furu Wei}, + year={2021}, + eprint={2106.13736}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +## Acknowledgement + +This repository is built using the [Fairseq](https://github.com/pytorch/fairseq) repository. + +## License +This project is licensed under the license found in the LICENSE file in the root directory of this source tree. + +[Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct) + +### Contact Information + +For help or issues using DeltaLM models, please submit a GitHub issue. + +For other communications related to DeltaLM, please contact Shuming Ma (`shumma@microsoft.com`), [Furu Wei](http://gitnlp.org/) (`fuwei@microsoft.com`). diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/clib/cuda/ngram_repeat_block_cuda_kernel.cu b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/clib/cuda/ngram_repeat_block_cuda_kernel.cu deleted file mode 100644 index bd6106cba0..0000000000 --- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/clib/cuda/ngram_repeat_block_cuda_kernel.cu +++ /dev/null @@ -1,82 +0,0 @@ -/* -Copyright (c) Microsoft Corporation. -Licensed under the MIT License. -*/ - -/* -Kernel implementation for blocking repeated n-grams. 
-*/ - -#include -#include -#include -#include -#include - -// Ban repeated ngrams of length = 'no_repeat_ngram_size' -__global__ void banRepeatedTokens( - long* __restrict__ tokens, - float* __restrict__ lprobs, - int max_predict_len, - int vocab_size, - int no_repeat_ngram_size) { - auto row = blockIdx.x; - auto col = threadIdx.x; - auto start = row * (max_predict_len) + col; - // Each thread compares ngram starting from - // thread index with final ngram starting from - // step - no_repeat_ngram_size +2 - auto check_start_pos = blockDim.x; - auto lprob_start = row * vocab_size; - bool is_banned = true; - extern __shared__ long tokens_shm[]; - tokens_shm[col] = tokens[start]; - if (col == blockDim.x - 1) { - for (int i = 1; i < no_repeat_ngram_size; i++) { - if (col + i < max_predict_len) { - tokens_shm[col + i] = tokens[start + i]; - } - } - } - __syncthreads(); - - for (int k = 0; k < no_repeat_ngram_size - 1; k++) { - if (tokens_shm[col + k] != tokens_shm[check_start_pos + k]) { - is_banned = false; - } - } - if (is_banned == true) { - auto token_to_be_banned = tokens_shm[col + no_repeat_ngram_size - 1]; - lprobs[lprob_start + token_to_be_banned] = -INFINITY; - } -} - -// Allocate blocks and threads based on -// batch size and sequence length and launch -// kernel -torch::Tensor ngram_repeat_block_cuda_forward( - const torch::Tensor tokens, - torch::Tensor lprobs, - int bsz, - int step, - int beam_size, - int no_repeat_ngram_size) { - int threads = step - no_repeat_ngram_size + 2; - if (threads <= 0) - return lprobs; - int max_predict_len = tokens.size(1); - int vocab_size = lprobs.size(1); - auto token_ptr = tokens.data_ptr(); - auto lprob_ptr = lprobs.data_ptr(); - int blocks = bsz * beam_size; - int shared_mem_size = (step + 1) * sizeof(long); - - // Launching N blocks where N is number of samples in a batch (beams*bsz) - // Launching T threads where T is number of previous ngrams in a sample - // Allocating shared mem per block for fastser access of input 
tokens since - // each token will be accessed N times to compare with current Ngram where - // N is Ngram size. - banRepeatedTokens<<>>( - token_ptr, lprob_ptr, max_predict_len, vocab_size, no_repeat_ngram_size); - return lprobs; -} diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/clib/libnat_cuda/edit_dist.cu b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/clib/libnat_cuda/edit_dist.cu deleted file mode 100644 index 96569d46c8..0000000000 --- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/clib/libnat_cuda/edit_dist.cu +++ /dev/null @@ -1,344 +0,0 @@ -/** - * Copyright 2017-present, Facebook, Inc. - * All rights reserved. - * - * This source code is licensed under the license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include "edit_dist.h" - -#include -#include -#include -#include -#include // std::pair - -template -__global__ void generate_deletion_label_kernel( - const scalar_t* __restrict__ source, - const size_t source_size, - const size_t operation_size, - int* __restrict__ operations, - int* __restrict__ labels) { - const int index = blockIdx.x; - const int offset = index * operation_size; - const int offset_label = index * source_size; - - for (int i = 0; i < source_size; i++) { - labels[offset_label + i] = 0; - } - - int k = 0; - for (int i = 0; i < operation_size; i++) { - if (operations[offset + i] == 0) { - break; - } else if (operations[offset + i] == 1) { - continue; - } else { - labels[offset_label + k] = 3 - operations[offset + i]; - k++; - } - } -} - -template -__global__ void generate_insertion_label_kernel( - const scalar_t* __restrict__ target, - const size_t target_size, - const size_t operation_size, - int* __restrict__ operations, - int* __restrict__ labels, - int* __restrict__ masks) { - const int index = blockIdx.x; - const int offset = index * operation_size; - const int offset_label = index * target_size; - - int k = 0; - int u = 0; - int m = 0; - - for (int 
i = 0; i < target_size; i++) { - labels[offset_label + i] = 0; - masks[offset_label + i] = 0; - } - - for (int i = 0; i < operation_size - 1; i++) { - if (operations[offset + i] == 0) { - break; - } else if (operations[offset + i] == 2) { - continue; - } else if (operations[offset + i] == 1) { - masks[offset_label + m] = 1; - u++; - m++; - } else { - labels[offset_label + k] = u; - masks[offset_label + m] = 0; - k++; - m++; - u = 0; - } - } -} - -template -__global__ void levenshtein_distance_kernel( - const scalar_t* __restrict__ source, - const scalar_t* __restrict__ target, - const int* __restrict__ source_length, - const int* __restrict__ target_length, - const size_t source_size, - const size_t target_size, - int* __restrict__ operations, - int* __restrict__ errors_curr) { - const int index = blockIdx.x; - const int offset = index * (source_size + target_size); - const int d = index * (source_size + 1) * (target_size + 1); - const int t = target_size + 1; - - auto err_idx = [d, t](int i, int j) { return d + i * t + j; }; - auto opt_idx = [offset](int k) { return offset + k; }; - - const int hyp_len = source_length[index]; - const int ref_len = target_length[index]; - const scalar_t* hyp_begin = source + index * source_size; - const scalar_t* ref_begin = target + index * target_size; - - // dynamic programming - for (int i = 0; i <= hyp_len; i++) { - errors_curr[err_idx(i, 0)] = i; - } - for (int j = 0; j <= ref_len; j++) { - errors_curr[err_idx(0, j)] = j; - } - for (int i = 1; i <= hyp_len; i++) { - for (int j = 1; j <= ref_len; j++) { - errors_curr[err_idx(i, j)] = min( - min(errors_curr[err_idx(i - 1, j)], errors_curr[err_idx(i, j - 1)]) + - 1, - errors_curr[err_idx(i - 1, j - 1)] + - 2 * (*(hyp_begin + i - 1) == *(ref_begin + j - 1) ? 
0 : 1)); - } - } - - // back-tracing - int i = hyp_len; - int j = ref_len; - int o = hyp_len + ref_len; - - for (int k = 0; k < source_size + target_size; k++) { - operations[opt_idx(k)] = 0; - } - - while ((i >= 0) && (j >= 0)) { - if ((i == 0) && (j == 0)) { - break; - } - - if ((j > 0) && - (errors_curr[err_idx(i, j - 1)] < errors_curr[err_idx(i, j)])) { - o--; - operations[opt_idx(o)] = 1; - j--; // insertion - } else if ( - (i > 0) && - (errors_curr[err_idx(i - 1, j)] < errors_curr[err_idx(i, j)])) { - o--; - operations[opt_idx(o)] = 2; - i--; // deletion - } else { - o--; - operations[opt_idx(o)] = 3; - i--; - j--; // do nothing - } - } - - // moving to the left - for (int k = 0; k < hyp_len + ref_len; k++) { - if (k + o < hyp_len + ref_len) { - operations[opt_idx(k)] = operations[opt_idx(k + o)]; - } else { - operations[opt_idx(k)] = 0; // padding - } - } -} - -template -__global__ void faster_levenshtein_distance_kernel( - const scalar_t* __restrict__ source, - const scalar_t* __restrict__ target, - const int* __restrict__ source_length, - const int* __restrict__ target_length, - const size_t source_size, - const size_t target_size, - int* __restrict__ operations) { - extern __shared__ short errors[]; - auto errors_curr = errors; - - const int index = blockIdx.x; - const int offset = index * (source_size + target_size); - const int t = target_size + 1; - - auto err_idx = [t](int i, int j) { return i * t + j; }; - auto opt_idx = [offset](int k) { return offset + k; }; - - const int hyp_len = source_length[index]; - const int ref_len = target_length[index]; - const scalar_t* hyp_begin = source + index * source_size; - const scalar_t* ref_begin = target + index * target_size; - - // dynamic programming - for (int i = 0; i <= hyp_len; i++) { - errors_curr[err_idx(i, 0)] = i; - } - for (int j = 0; j <= ref_len; j++) { - errors_curr[err_idx(0, j)] = j; - } - for (int i = 1; i <= hyp_len; i++) { - for (int j = 1; j <= ref_len; j++) { - errors_curr[err_idx(i, j)] = 
min( - min(errors_curr[err_idx(i - 1, j)], errors_curr[err_idx(i, j - 1)]) + - 1, - errors_curr[err_idx(i - 1, j - 1)] + - 2 * (*(hyp_begin + i - 1) == *(ref_begin + j - 1) ? 0 : 1)); - } - } - - // back-tracing - int i = hyp_len; - int j = ref_len; - int o = hyp_len + ref_len; - - for (int k = 0; k < source_size + target_size; k++) { - operations[opt_idx(k)] = 0; - } - - while ((i >= 0) && (j >= 0)) { - if ((i == 0) && (j == 0)) { - break; - } - - if ((j > 0) && - (errors_curr[err_idx(i, j - 1)] < errors_curr[err_idx(i, j)])) { - o--; - operations[opt_idx(o)] = 1; - j--; // insertion - } else if ( - (i > 0) && - (errors_curr[err_idx(i - 1, j)] < errors_curr[err_idx(i, j)])) { - o--; - operations[opt_idx(o)] = 2; - i--; // deletion - } else { - o--; - operations[opt_idx(o)] = 3; - i--; - j--; // do nothing - } - } - - // moving to the left - for (int k = 0; k < hyp_len + ref_len; k++) { - if (k + o < hyp_len + ref_len) { - operations[opt_idx(k)] = operations[opt_idx(k + o)]; - } else { - operations[opt_idx(k)] = 0; // padding - } - } -} - -torch::Tensor GenerateDeletionLabelCuda( - torch::Tensor source, - torch::Tensor operations) { - const auto batch_size = source.size(0); - at::TensorOptions options(source.device()); - options = options.dtype(at::ScalarType::Int); - auto labels = torch::empty({batch_size, source.size(1)}, options); - auto stream = at::cuda::getCurrentCUDAStream(source.device().index()); - - AT_DISPATCH_ALL_TYPES(source.scalar_type(), "generate_deletion_labels", ([&] { - generate_deletion_label_kernel - <<>>( - source.data_ptr(), - source.size(1), - operations.size(1), - operations.data_ptr(), - labels.data_ptr()); - })); - - return labels; -} - -std::pair GenerateInsertionLabelCuda( - torch::Tensor target, - torch::Tensor operations) { - const auto batch_size = target.size(0); - at::TensorOptions options(target.device()); - options = options.dtype(at::ScalarType::Int); - auto labels = torch::empty({batch_size, target.size(1)}, options); - auto 
masks = torch::empty({batch_size, target.size(1)}, options); - auto stream = at::cuda::getCurrentCUDAStream(target.device().index()); - - AT_DISPATCH_ALL_TYPES( - target.scalar_type(), "generate_insertion_labels", ([&] { - generate_insertion_label_kernel<<>>( - target.data_ptr(), - target.size(1), - operations.size(1), - operations.data_ptr(), - labels.data_ptr(), - masks.data_ptr()); - })); - - return std::make_pair(labels, masks); -} - -torch::Tensor LevenshteinDistanceCuda( - torch::Tensor source, - torch::Tensor target, - torch::Tensor source_length, - torch::Tensor target_length) { - const auto batch_size = source.size(0); - const auto shared_size = - (source.size(1) + 1) * (target.size(1) + 1) * sizeof(short); - - at::TensorOptions options(source.device()); - options = options.dtype(at::ScalarType::Int); - auto operations = - torch::empty({batch_size, source.size(1) + target.size(1)}, options); - auto stream = at::cuda::getCurrentCUDAStream(source.device().index()); - - if (shared_size > 40000) { - auto distances = torch::empty( - {batch_size, (source.size(1) + 1) * (target.size(1) + 1)}, options); - AT_DISPATCH_ALL_TYPES(source.scalar_type(), "levenshtein_distance", ([&] { - levenshtein_distance_kernel - <<>>( - source.data_ptr(), - target.data_ptr(), - source_length.data_ptr(), - target_length.data_ptr(), - source.size(1), - target.size(1), - operations.data_ptr(), - distances.data_ptr()); - })); - } else { - AT_DISPATCH_ALL_TYPES( - source.scalar_type(), "faster_levenshtein_distance", ([&] { - faster_levenshtein_distance_kernel - <<>>( - source.data_ptr(), - target.data_ptr(), - source_length.data_ptr(), - target_length.data_ptr(), - source.size(1), - target.size(1), - operations.data_ptr()); - })); - } - - return operations; -} diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/cuda_utils.cu b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/cuda_utils.cu deleted file mode 100644 index 924f852758..0000000000 --- 
a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/cuda_utils.cu +++ /dev/null @@ -1,202 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -template -constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) { - return (a + b - 1) / b; -} - -template -__inline__ __device__ void zeroSharedMem(scalar_t* data) { - /* - Given an array of length FS + SB, zero out the first padding_l and last - (FS - padding_l) values in the array - */ - - int tid = threadIdx.x; - - if (FS < SB) { - // zero all if we have enough threads in a block to do all of them - if (tid < padding_l || tid > SB - FS + padding_l - 1) { - data[tid] = scalar_t(0.0); - } - } else { - // otherwise zero out one block at a time - const int numIterations = divUp(FS, SB); - for (int i = 0; i < numIterations; i++) { - int offset = i * SB; - if (tid + offset < padding_l) { - data[tid + offset] = scalar_t(0.0); - } else if (tid + offset < FS) { - data[SB + tid + offset] = scalar_t(0.0); - } - } - } -} - -template -__inline__ __device__ scalar_t warpReduce(scalar_t data) { - /* - Reduce an array within each warp. After processing all values in warp will - caontain the sum of all original values in that warp. - - data - pointer to data to reduce - */ - data += __shfl_xor_sync(SHFL_MASK, data, 16); - data += __shfl_xor_sync(SHFL_MASK, data, 8); - data += __shfl_xor_sync(SHFL_MASK, data, 4); - data += __shfl_xor_sync(SHFL_MASK, data, 2); - data += __shfl_xor_sync(SHFL_MASK, data, 1); - return data; -} - -template -__inline__ __device__ scalar_t blockReduce(scalar_t data) { - /* - Reduce an entire array on the block level. After processing, the - first value in the array will contain the reduced sum. 
- - data - pointer to data to reduce - */ - - static __shared__ scalar_t warpSum[32]; - const int tid = threadIdx.x; - int wid = tid / 32; - int lane = tid % 32; - - __syncthreads(); - - // reduce each warp then write to shared memory - scalar_t sum = warpReduce(data); - if (lane == 0) { - warpSum[wid] = sum; - } - - __syncthreads(); - - scalar_t v; - // perform final sum of partial warp sums - if (tid < blockDim.x / 32) { - v = warpSum[lane]; - } else { - v = scalar_t(0.0); - } - - if (wid == 0) { - v = warpReduce(v); - } - __syncthreads(); - - return v; -} - -void checkCudaStatus(cudaError_t status, int lineNumber = -1) { - if (status != cudaSuccess) { - std::cout << cudaGetErrorString(status) << " at line " << lineNumber - << std::endl; - std::cout << "Exiting" << std::endl; - exit(1); - } -} - -template -__device__ void load_input_to_shared( - const scalar_t* input, // global memory - int inputOffset, - int sequenceLength, - int iteration, - int numIterations, - bool no_prev, - scalar_t* output /* shared memory */) { - /* - Load a block size of input into shared memory with - right and left overhang of total size FS. If previously - loaded memory, overlap will be shifted over to reduce - global memory access - - input - pointer to start of channel sequence - inputOffset - how far in the sequence to start loading - sequenceLength - total length of sequence - iteration - which block of sequence we are loading - numIterations - total number of blocks to load - no_prev - whether to load the whole block if the previous block - wasn't loaded - output - shared memory to write input to - */ - - const int tid = threadIdx.x; - - // Load the left "overhang" of input - if (iteration > 0) { - if (padding_l < SB) { - // load all at once - if (tid < padding_l) { - output[tid] = - (no_prev) ? 
input[inputOffset - padding_l + tid] : output[tid + SB]; - } - } else { - // load in chunks of size SB - int numIterations = divUp(padding_l, SB); - for (int i = 0; i < numIterations; i++) { - int offset = i * SB; - if ((tid + offset) < padding_l) { - output[tid + offset] = (no_prev) - ? input[inputOffset - padding_l + tid + offset] - : output[tid + offset + SB]; - } - } - } - } - - // Load the right "overhang" of input - if (iteration < (numIterations - 1)) { - const int elementsLeft = sequenceLength - (iteration + 1) * SB; - - if ((FS - padding_l) < SB) { - // load all at once - if (tid < (FS - padding_l)) { - output[padding_l + SB + tid] = (tid < elementsLeft) - ? input[inputOffset + SB + tid] - : scalar_t(0.0); - } - } else { - // load in chunks of size SB - int numIterations = divUp(FS - padding_l, SB); - for (int i = 0; i < numIterations; i++) { - int offset = i * SB; - if ((tid + offset) < (FS - padding_l)) { - output[padding_l + SB + tid + offset] = - ((tid + offset) < elementsLeft) - ? input[inputOffset + SB + tid + offset] - : scalar_t(0.0); - } - } - } - } - - // We should also clear out the right "overhang" - if (iteration == (numIterations - 1)) { - if ((FS - padding_l) < SB) { - // clear out all at once - if (tid < (FS - padding_l)) { - output[padding_l + SB + tid] = scalar_t(0.0); - } - } else { - // clear in chunks of size SB - int numIterations = divUp(FS - padding_l, SB); - for (int i = 0; i < numIterations; i++) { - int offset = i * SB; - if ((tid + offset) < (FS - padding_l)) { - output[padding_l + SB + tid + offset] = scalar_t(0.0); - } - } - } - } - output[tid + padding_l] = ((inputOffset + tid) < sequenceLength) - ? 
input[inputOffset + tid] - : scalar_t(0.0); -} diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu deleted file mode 100644 index 4630f1e982..0000000000 --- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/dynamicconv_layer/dynamicconv_cuda_kernel.cu +++ /dev/null @@ -1,176 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include "../cuda_utils.cu" -#include "dynamicconv_cuda.cuh" -#include "dynamicconv_cuda_backward.cu" -#include "dynamicconv_cuda_forward.cu" - -// FS is filter size and kernels are specialized for filter sizes -template -__global__ void dynamicconv_forward_kernel( - const scalar_t* input, - const scalar_t* weight, - int minibatch, - int sequenceLength, - int numFeatures, - int numFiltersInBlock, - int numHeads, - scalar_t* output) { - assert(blockDim.x == SB); - - const int tid = threadIdx.x; - const int batchIdx = blockIdx.x; - const int featureIdx = blockIdx.y; - const int head = featureIdx / numFiltersInBlock; - - const int IOOffset = - batchIdx * numFeatures * sequenceLength + featureIdx * sequenceLength; - const scalar_t* inputFeature = &input[IOOffset]; - scalar_t* outputFeature = &output[IOOffset]; - - scalar_t filter[FS]; - - __shared__ scalar_t tempInput[SB + FS]; - zeroSharedMem(tempInput); - - const int numIterations = divUp(sequenceLength, SB); - - for (int i = 0; i < numIterations; ++i) { - __syncthreads(); - const int inputOffset = i * SB; - load_input_to_shared( - inputFeature, - inputOffset, - sequenceLength, - i, - numIterations, - false, - tempInput); - __syncthreads(); - if (inputOffset + tid < sequenceLength) { -#pragma unroll - for (int k = 0; k < FS; ++k) { - const int 
filterOffset = batchIdx * numHeads * FS * sequenceLength + - head * FS * sequenceLength + k * sequenceLength + i * SB + tid; - filter[k] = weight[filterOffset]; - } - - scalar_t out = scalar_t(0.0); -#pragma unroll - for (int k = 0; k < FS; ++k) { - out += filter[k] * tempInput[tid + k]; - } - - outputFeature[inputOffset + tid] = out; - } - } -} - -template -__global__ void dynamicconv_backward_kernel( - const scalar_t* gradOutput, // B * C * T - const scalar_t* input, // B * C * T - const scalar_t* weight, - int minibatch, - int sequenceLength, - int numFeatures, - int numFiltersInBlock, - int numHeads, - scalar_t* gradWeight, - scalar_t* gradInput) { // B * H * k * T - - assert(blockDim.x == SB); - - // each block operates on a single batch and filter head - const int tid = threadIdx.x; - const int batchIdx = blockIdx.x; - const int headIdx = blockIdx.y; - const int chunkIdx = blockIdx.z; - - const int numChunks = divUp(sequenceLength, SB); - const int inputOffset = chunkIdx * SB; - - // initialize shared memory for output gradient and input - __shared__ scalar_t tempGradOutput[SB + FS]; - __shared__ scalar_t tempInput[SB + FS]; - const int padding = FS - padding_l - 1; - - zeroSharedMem(tempGradOutput); - zeroSharedMem(tempInput); - - // initialize local filter and weight gradient sum arrays - scalar_t tempGradSum[FS]; - scalar_t bfilter[FS]; - for (int k = 0; k < FS; ++k) { - tempGradSum[k] = scalar_t(0.0); - - int idxOffset = inputOffset + tid + k - padding; - if (idxOffset >= 0 && idxOffset < sequenceLength) { - int bfilterOffset = batchIdx * numHeads * FS * sequenceLength + - headIdx * FS * sequenceLength + (FS - k - 1) * sequenceLength + - idxOffset; - bfilter[k] = weight[bfilterOffset]; - } else { - bfilter[k] = scalar_t(0.0); - } - } - - // iterate over filter block - for (int featureIdx = 0; featureIdx < numFiltersInBlock; ++featureIdx) { - __syncthreads(); - - // load input and output gradient for this channel and chunk - const int IOOffset = batchIdx * 
numFeatures * sequenceLength + - (headIdx * numFiltersInBlock + featureIdx) * sequenceLength; - const scalar_t* inputFeature = &input[IOOffset]; - const scalar_t* gradOutputFeature = &gradOutput[IOOffset]; - scalar_t* gradInputFeature = &gradInput[IOOffset]; - - load_input_to_shared( - gradOutputFeature, - inputOffset, - sequenceLength, - chunkIdx, - numChunks, - true, - tempGradOutput); - load_input_to_shared( - inputFeature, - inputOffset, - sequenceLength, - chunkIdx, - numChunks, - true, - tempInput); - __syncthreads(); - - // sum input and weight gradients - scalar_t out = scalar_t(0.0); -#pragma unroll - for (int k = 0; k < FS; ++k) { - tempGradSum[k] += tempInput[tid + k] * tempGradOutput[tid + padding]; - out += bfilter[k] * tempGradOutput[tid + k]; - } - - if (inputOffset + tid < sequenceLength) { - gradInputFeature[inputOffset + tid] = out; - } - } - - const int gradOffset = - batchIdx * numHeads * FS * sequenceLength + headIdx * FS * sequenceLength; - scalar_t* gradWeightFeature = &gradWeight[gradOffset]; - - // write weight gradient - if (inputOffset + tid < sequenceLength) { - for (int k = 0; k < FS; ++k) { - const int outputOffset = k * sequenceLength + inputOffset + tid; - gradWeightFeature[outputOffset] = tempGradSum[k]; - } - } -} diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu deleted file mode 100644 index cdf31d5d2d..0000000000 --- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu +++ /dev/null @@ -1,400 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include "../cuda_utils.cu" -#include "lightconv_cuda.cuh" -#include "lightconv_cuda_backward.cu" -#include "lightconv_cuda_forward.cu" - -template -__global__ void lightconv_forward_kernel( - const scalar_t* input, - const scalar_t* filters, - int minibatch, - int sequenceLength, - int numFeatures, - int numFiltersInBlock, - scalar_t* output) { - const int tid = threadIdx.x; - const int batchIdx = blockIdx.x; - const int featureIdx = blockIdx.y; - const int filterIdx = featureIdx / numFiltersInBlock; - - const int IOOffset = - numFeatures * sequenceLength * batchIdx + featureIdx * sequenceLength; - const scalar_t* inputFeature = &input[IOOffset]; - scalar_t* outputFeature = &output[IOOffset]; - const scalar_t* inputFilter = &filters[filterIdx * FS]; - - assert(blockDim.x == SB); - - scalar_t filter[FS]; -#pragma unroll - for (int i = 0; i < FS; ++i) { - filter[i] = inputFilter[i]; - } - - __shared__ scalar_t temp[SB + FS]; - zeroSharedMem(temp); - - const int numIterations = divUp(sequenceLength, SB); - - for (int i = 0; i < numIterations; ++i) { - // Read input into shared memory - const int inputOffset = i * SB; - - load_input_to_shared( - inputFeature, - inputOffset, - sequenceLength, - i, - numIterations, - (numIterations == 1), - temp); - - __syncthreads(); - - scalar_t out = 0; -#pragma unroll - for (int j = 0; j < FS; ++j) { - out += filter[j] * temp[tid + j]; - } - - // Write output - const int outputOffset = inputOffset; - if ((outputOffset + tid) < sequenceLength) { - outputFeature[outputOffset + tid] = out; - } - - __syncthreads(); - } -} - -template -__global__ void lightconv_grad_wrt_input_kernel( - const scalar_t* input, - const scalar_t* filters, - int minibatch, - int sequenceLength, - int numFeatures, - int numFiltersInBlock, - scalar_t* output) { - // input grad kernel is similar to forward kernel - const int tid = threadIdx.x; - const int batchIdx = blockIdx.x; - const int featureIdx = blockIdx.y; - const int filterIdx = featureIdx / 
numFiltersInBlock; - - const int IOOffset = - numFeatures * sequenceLength * batchIdx + featureIdx * sequenceLength; - const scalar_t* inputFeature = &input[IOOffset]; - scalar_t* outputFeature = &output[IOOffset]; - const scalar_t* inputFilter = &filters[filterIdx * FS]; - - assert(blockDim.x == SB); - - scalar_t filter[FS]; - -// The only change is loading the filter in reverse -#pragma unroll - for (int i = 0; i < FS; ++i) { - filter[i] = inputFilter[FS - i - 1]; - } - - __shared__ scalar_t temp[SB + FS]; - const int padding = FS - padding_l - 1; - zeroSharedMem(temp); - - __syncthreads(); - - const int numIterations = divUp(sequenceLength, SB); - - for (int i = 0; i < numIterations; ++i) { - // Read input into shared memory - const int inputOffset = i * SB; - - load_input_to_shared( - inputFeature, - inputOffset, - sequenceLength, - i, - numIterations, - false, - temp); - - __syncthreads(); - - scalar_t out = 0; -#pragma unroll - for (int j = 0; j < FS; ++j) { - out += filter[j] * temp[tid + j]; - } - - // Write output - const int outputOffset = inputOffset; - if ((outputOffset + tid) < sequenceLength) { - outputFeature[outputOffset + tid] = out; - } - - __syncthreads(); - } -} - -// This is by far the most expensive kernel in terms of time taken. 
-// Can be 16x slower than the forward or grad_wrt_input when filter size is 31 -template -__global__ void lightconv_grad_wrt_weights_firstpass_short_kernel( - const scalar_t* input, - const scalar_t* gradInput, - int minibatch, - int sequenceLength, - int numFeatures, - int numFiltersInBlock, - int numHeads, - float* output) { - const int tid = threadIdx.x; - const int batchIdx = blockIdx.x; - const int filterIdx = blockIdx.y; - - const int numIterations = divUp(sequenceLength, SB); - - float* tempOutputGradWeight = &output[filterIdx * FS * minibatch]; - - assert(blockDim.x == SB); - - __shared__ scalar_t tempInput[SB + FS]; - __shared__ scalar_t tempGradInput[SB + FS]; - - // local weight accumulation - float accumWeights[FS]; - - // Initialize memory - for (int i = 0; i < FS; ++i) { - accumWeights[i] = float(0.0); - } - - // loop over each sequence within filterblock - for (int idxInFilterBlock = 0; idxInFilterBlock < numFiltersInBlock; - ++idxInFilterBlock) { - const int featureOffset = batchIdx * numFeatures * sequenceLength + - (filterIdx * numFiltersInBlock + idxInFilterBlock) * sequenceLength; - const scalar_t* inputFeature = &input[featureOffset]; - const scalar_t* gradInputFeature = &gradInput[featureOffset]; - - zeroSharedMem(tempInput); - zeroSharedMem(tempGradInput); - __syncthreads(); - - for (int i = 0; i < numIterations; ++i) { - const int inputOffset = i * SB; - - load_input_to_shared( - inputFeature, - inputOffset, - sequenceLength, - i, - numIterations, - false, - tempInput); - load_input_to_shared( - gradInputFeature, - inputOffset, - sequenceLength, - i, - numIterations, - false, - tempGradInput); - - __syncthreads(); - - const int gradIndex = (FS / 2) + tid; - scalar_t tempGrad = tempGradInput[gradIndex]; - -#pragma unroll - for (int j = 0; j < FS; j++) { - const int inputIndex = tid + j; - accumWeights[j] += tempInput[inputIndex] * tempGrad; - } - - __syncthreads(); - } - } - - // Row-major sum - for (int filterWeightIdx = 0; filterWeightIdx 
< FS; ++filterWeightIdx) { - float temp; - if (tid < sequenceLength) { - temp = accumWeights[filterWeightIdx]; - } else { - temp = float(0.0); - } - - const int outputOffset = filterWeightIdx * minibatch + batchIdx; - - temp = blockReduce(temp); - - if (tid == 0) { - tempOutputGradWeight[outputOffset] = temp; - } - } -} - -template -__global__ void lightconv_grad_wrt_weights_secondpass_short_kernel( - const float* input, - const int minibatch, - const int numFiltersInBlock, - scalar_t* output) { - assert(blockDim.x == SB); - - const int tid = threadIdx.x; - - const int filterIdx = blockIdx.x; - const int filterWeightIdx = blockIdx.y; - - const int inputOffset = - filterIdx * FS * minibatch + filterWeightIdx * minibatch; - const float* tempInput = &input[inputOffset]; - - // read into shared memory for reduction - int readIndex = tid; - - float sum = 0.0; - while (readIndex < minibatch) { - sum += tempInput[readIndex]; - readIndex += SB; - } - - float temp = blockReduce(sum); - - if (tid == 0) { - output[blockIdx.x * FS + blockIdx.y] = temp; - } -} - -// This is by far the most expensive kernel in terms of time taken. 
-// Can be 16x slower than the forward or grad_wrt_input when filter size is 31 -template -__global__ void lightconv_grad_wrt_weights_firstpass_kernel( - const scalar_t* input, - const scalar_t* gradInput, - int minibatch, - int sequenceLength, - int numFeatures, - int numFiltersInBlock, - float* output) { - assert(blockDim.x == SB); - - const int tid = threadIdx.x; - const int batchIdx = blockIdx.x; - const int featureIdx = blockIdx.y; - const int filterIdx = featureIdx / numFiltersInBlock; - const int idxInFilterBlock = featureIdx % numFiltersInBlock; - - const int numIterations = divUp(sequenceLength, SB); - - float temp; - - __shared__ scalar_t tempInput[SB + FS]; - __shared__ scalar_t tempGradInput[SB + FS]; - zeroSharedMem(tempInput); - zeroSharedMem(tempGradInput); - __syncthreads(); - - float accumWeights[FS]; - - for (int i = 0; i < FS; ++i) { - accumWeights[i] = float(0.0); - } - - const int IOOffset = - batchIdx * numFeatures * sequenceLength + featureIdx * sequenceLength; - const scalar_t* inputFeature = &input[IOOffset]; - const scalar_t* gradInputFeature = &gradInput[IOOffset]; - float* tempOutputGradWeight = - &output[filterIdx * FS * minibatch * numFiltersInBlock]; - - for (int i = 0; i < numIterations; ++i) { - const int inputOffset = i * SB; - - load_input_to_shared( - inputFeature, - inputOffset, - sequenceLength, - i, - numIterations, - false, - tempInput); - load_input_to_shared( - gradInputFeature, - inputOffset, - sequenceLength, - i, - numIterations, - false, - tempGradInput); - __syncthreads(); - -#pragma unroll - for (int j = 0; j < FS; ++j) { - accumWeights[j] += tempInput[tid + j] * tempGradInput[tid + (FS / 2)]; - } - - __syncthreads(); - } - - // Row-major sum - for (int filterWeightIdx = 0; filterWeightIdx < FS; ++filterWeightIdx) { - // Write to shared memory before reduction - if (tid < sequenceLength) { - temp = accumWeights[filterWeightIdx]; - } else { - temp = float(0.0); - } - - temp = blockReduce(temp); - - const int 
outputOffset = filterWeightIdx * minibatch * numFiltersInBlock + - batchIdx * numFiltersInBlock + idxInFilterBlock; - - if (tid == 0) { - tempOutputGradWeight[outputOffset] = temp; - } - } -} - -template -__global__ void lightconv_grad_wrt_weights_secondpass_kernel( - const float* input, - const int minibatch, - const int numFiltersInBlock, - scalar_t* output) { - assert(blockDim.x == SB); - const int tid = threadIdx.x; - - // What is the id within a minibatch - const int filterIdx = blockIdx.x; - const int filterWeightIdx = blockIdx.y; - - const int inputOffset = filterIdx * FS * minibatch * numFiltersInBlock + - filterWeightIdx * minibatch * numFiltersInBlock; - const float* tempInput = &input[inputOffset]; - - int readIndex = tid; - - float sum = float(0.0); - while (readIndex < (minibatch * numFiltersInBlock)) { - sum += tempInput[readIndex]; - readIndex += SB; - } - - float temp = blockReduce(sum); - - if (tid == 0) { - output[blockIdx.x * FS + blockIdx.y] = temp; - } -} diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/optim/dynamic_loss_scaler.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/optim/dynamic_loss_scaler.py index 43f9be37b9..743e30a675 100644 --- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/optim/dynamic_loss_scaler.py +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/optim/dynamic_loss_scaler.py @@ -2,7 +2,8 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
- +import torch +import torch_npu class DynamicLossScaler(object): def __init__( @@ -24,6 +25,7 @@ class DynamicLossScaler(object): self._last_rescale_iter = -1 self._overflows_since_rescale = 0 self.min_loss_scale = min_loss_scale + self.found_inf = torch.npu.FloatTensor([0.0]) def scale(self, outputs): return self.loss_scale * outputs @@ -41,7 +43,15 @@ class DynamicLossScaler(object): def check_overflow(self, grad_norm): # detect inf and nan - if grad_norm == float("inf") or grad_norm != grad_norm: + self.found_inf.fill_(0.0) + has_overflow = torch.npu.get_npu_overflow_flag() + if has_overflow: + self.found_inf.fill_(1) + if torch.distributed.is_initialized(): + torch.distributed.all_reduce(self.found_inf, + op=torch.distributed.ReduceOp.MAX) + found_inf_flag = self.found_inf.item() > 0 + if found_inf_flag: # overflow has occured prev_scale = self.loss_scale iter_since_rescale = self._iter - self._last_rescale_iter @@ -68,3 +78,4 @@ class DynamicLossScaler(object): self._iter += 1 raise OverflowError("setting loss scale to: " + str(self.loss_scale)) + diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/sequence_generator.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/sequence_generator.py index 2e61140dd8..83c7b0e216 100644 --- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/sequence_generator.py +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/sequence_generator.py @@ -500,7 +500,7 @@ class SequenceGenerator(nn.Module): # from the list of {2 * beam_size} candidates were # selected. Shapes: (batch size, beam size) new_cands_to_ignore, active_hypos = torch.topk( - active_mask, k=beam_size, dim=1, largest=False + active_mask.float(), k=beam_size, dim=1, largest=False ) # update cands_to_ignore to ignore any finalized hypos. 
diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/tasks/fairseq_task.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/tasks/fairseq_task.py index 8148c77fe1..a097e75148 100644 --- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/tasks/fairseq_task.py +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/tasks/fairseq_task.py @@ -490,6 +490,7 @@ class FairseqTask(object): with torch.autograd.profiler.record_function("forward"): with torch.cuda.amp.autocast(enabled=(isinstance(optimizer, AMPOptimizer))): loss, sample_size, logging_output = criterion(model, sample) + torch.npu.clear_npu_overflow_flag() if ignore_grad: loss *= 0 with torch.autograd.profiler.record_function("backward"): diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/trainer.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/trainer.py index 5f20895c1f..5424171996 100644 --- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/trainer.py +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/trainer.py @@ -56,7 +56,7 @@ class Trainer(object): self.tpu = cfg.common.tpu self.cuda = torch.cuda.is_available() and not cfg.common.cpu and not self.tpu if self.cuda: - self.device = torch.device("cuda") + self.device = torch.device("npu") elif self.tpu: self.device = utils.get_tpu_device() else: @@ -302,11 +302,6 @@ class Trainer(object): self.cfg, params, allow_unsupported=allow_unsupported ) elif self.cfg.common.fp16 or self.cfg.common.bf16 or self.cfg.common.amp: - if self.cuda and torch.cuda.get_device_capability(0)[0] < 7: - logger.info( - "NOTE: your device does NOT support faster training with --fp16 or --amp, " - "please switch to FP32 which is likely to be faster" - ) if ( self.cfg.common.memory_efficient_fp16 or self.cfg.common.memory_efficient_bf16 diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/utils.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/utils.py index 
d1ec9a274c..96cfce1085 100644 --- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/utils.py +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/utils.py @@ -737,8 +737,6 @@ class CudaEnvironment(object): cur_device = torch.cuda.current_device() prop = torch.cuda.get_device_properties("cuda:{}".format(cur_device)) self.name = prop.name - self.major = prop.major - self.minor = prop.minor self.total_memory_in_GB = prop.total_memory / 1024 / 1024 / 1024 @staticmethod @@ -754,7 +752,6 @@ class CudaEnvironment(object): for r, env in enumerate(cuda_env_list): logger.info( "rank {:3d}: ".format(r) - + "capabilities = {:2d}.{:<2d} ; ".format(env.major, env.minor) + "total memory = {:.3f} GB ; ".format(env.total_memory_in_GB) + "name = {:40s}".format(env.name) ) diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq_cli/train.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq_cli/train.py index 30df9b34dd..8b93429c23 100644 --- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq_cli/train.py +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq_cli/train.py @@ -47,6 +47,7 @@ from omegaconf import DictConfig, OmegaConf def main(cfg: FairseqConfig) -> None: + torch.npu.set_compile_mode(jit_compile=False) if isinstance(cfg, argparse.Namespace): cfg = convert_namespace_to_omegaconf(cfg) @@ -167,8 +168,8 @@ def main(cfg: FairseqConfig) -> None: train_meter = meters.StopwatchMeter() train_meter.start() - logger.info(f"epoch_itr.next_epoch_itr: {epoch_itr.next_epoch_itr}") logger.info(f"max_epoch: {max_epoch}") + torch.npu.clear_npu_overflow_flag() while epoch_itr.next_epoch_idx <= max_epoch: if lr <= cfg.optimization.stop_min_lr: logger.info( diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/requirements.txt b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/requirements.txt new file mode 100644 index 0000000000..71fa21ef3e --- /dev/null +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/requirements.txt @@ -0,0 +1,3 @@ 
+sacrebleu +sacremoses +scipy diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/env_npu.sh b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/env_npu.sh new file mode 100644 index 0000000000..4daa5da815 --- /dev/null +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/env_npu.sh @@ -0,0 +1,31 @@ +#!/bin/bash +CANN_INSTALL_PATH_CONF='/etc/Ascend/ascend_cann_install.info' + +if [ -f $CANN_INSTALL_PATH_CONF ]; then + CANN_INSTALL_PATH=$(cat $CANN_INSTALL_PATH_CONF | grep Install_Path | cut -d "=" -f 2) +else + CANN_INSTALL_PATH="/usr/local/Ascend" +fi + +if [ -d ${CANN_INSTALL_PATH}/ascend-toolkit/latest ]; then + source ${CANN_INSTALL_PATH}/ascend-toolkit/set_env.sh +else + source ${CANN_INSTALL_PATH}/nnae/set_env.sh +fi + +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +export ASCEND_GLOBAL_LOG_LEVEL=3 +export ASCEND_GLOBAL_EVENT_ENABLE=0 +export TASK_QUEUE_ENABLE=1 +export HCCL_WHITELIST_DISABLE=1 + +msnpureport -g error -d 0 +msnpureport -g error -d 1 +msnpureport -g error -d 2 +msnpureport -g error -d 3 +msnpureport -g error -d 4 +msnpureport -g error -d 5 +msnpureport -g error -d 6 +msnpureport -g error -d 7 + +msnpureport -e disable diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_full_8p.sh b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_full_8p.sh new file mode 100644 index 0000000000..d7e8add299 --- /dev/null +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_full_8p.sh @@ -0,0 +1,136 @@ +#!/bin/bash + +Network="Deltalm_for_PyTorch" +export RANK_SIZE=8 +export MASTER_ADDR=localhost +export MASTER_PORT=29688 +data_path="" + +# Number of training epochs (overridable with --epochs=N) +train_epochs=100 +# Training batch size in tokens (overridable with --batch_size=N); review and adjust per model +token_size=1024 + +# Argument parsing: data_path is required; other arguments may be added or removed per model; any argument added here must be defined and given a default above +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --epochs* ]];then + train_epochs=`echo ${para#*=}` + elif [[ $para == --batch_size* ]];then + token_size=`echo ${para#*=}` + fi +done + +# Verify that data_path was supplied; no need to modify +if [[ $data_path == ""
]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ]; then + test_path_dir=${cur_path} + cd .. + cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi +save_dir=${cur_path}/checkpoint + +#创建DeviceID输出目录,不需要修改 +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/$ASCEND_DEVICE_ID ];then + rm -rf ${test_path_dir}/output/* + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +nohup python train.py $data_path \ + --save-dir $save_dir \ + --arch deltalm_base \ + --pretrained-deltalm-checkpoint $data_path/deltalm-base.pt \ + --share-all-embeddings \ + --max-source-positions 512 --max-target-positions 512 \ + --criterion label_smoothed_cross_entropy \ + --label-smoothing 0.1 \ + --optimizer adam --adam-betas '(0.9, 0.98)' \ + --lr-scheduler inverse_sqrt \ + --lr 1e-4 \ + --warmup-init-lr 1e-07 \ + --stop-min-lr 1e-09 \ + --warmup-updates 4000 \ + --max-update 400000 \ + --max-epoch $train_epochs \ + --max-tokens $token_size \ + --update-freq 1 \ + --seed 1 \ + --log-format simple \ + --skip-invalid-size-inputs-valid-test \ + --fp16 \ + --keep-last-epochs 2 \ + --eval-bleu \ + --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ + --eval-bleu-detok moses \ + --eval-bleu-remove-bpe=sentencepiece \ + --best-checkpoint-metric bleu --maximize-best-checkpoint-metric > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo 
"------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +WPS=`grep 'train_inner ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "wps=" '{print $NF}'|awk -F "wps" '{print $1}'|awk -F "," '{print $1}'|awk 'END {print}'` +train_wall=`grep 'train_inner ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "train_wall=" '{print $NF}'|awk 'NR==1{min=$1;next}{min=min<$1?min:$1}END{print min}'` +TRAIN_WALL=`grep 'train_inner ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "train_wall=" '{print $NF}'|awk -F "," '{print $1}'|tail -n 20|awk '{sum+=$1} END {print"",sum/NR}'|sed s/[[:space:]]//g` + +echo "Final Performance words/sec : $WPS" +echo "train_wall : $TRAIN_WALL" + +train_accuracy=`grep 'valid ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log| awk -F "best_bleu" '{print $NF}' | awk 'END {print}' | sed s/[[:space:]]//g` +#输出精度信息 +echo "Final Train Accuracy : ${train_accuracy}" +echo "E2E Training Duration sec : $e2e_time" + +#性能看护结果汇总 +#训练用例信息,不需要修改 +TokenSize=${token_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${TokenSize}_${RANK_SIZE}'p'_'perf' +#吞吐量 +ActualWPS=${WPS} +##单迭代训练时长 +TrainingTime=${train_wall} +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep -r "loss=" ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $13}' > ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TokenSize = ${TokenSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> 
${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualWPS = ${ActualWPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainAccuracy = ${train_accuracy}">> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_performance_1p.sh b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_performance_1p.sh new file mode 100644 index 0000000000..44830be322 --- /dev/null +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_performance_1p.sh @@ -0,0 +1,136 @@ +#!/bin/bash + +Network="Deltalm_for_PyTorch" +export RANK_SIZE=1 +export MASTER_ADDR=localhost +export MASTER_PORT=29688 +data_path="" + +#训练epoch +train_epochs=1 +#训练batch_size,,需要模型审视修改 +token_size=1024 + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --epochs* ]];then + train_epochs=`echo ${para#*=}` + elif [[ $para == --batch_size* ]];then + token_size=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ]; then + test_path_dir=${cur_path} + cd ..
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi +save_dir=${cur_path}/checkpoint + +#创建DeviceID输出目录,不需要修改 +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/$ASCEND_DEVICE_ID ];then + rm -rf ${test_path_dir}/output/* + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +nohup python train.py $data_path \ + --save-dir $save_dir \ + --arch deltalm_base \ + --pretrained-deltalm-checkpoint $data_path/deltalm-base.pt \ + --share-all-embeddings \ + --max-source-positions 512 --max-target-positions 512 \ + --criterion label_smoothed_cross_entropy \ + --distributed-world-size 1 \ + --distributed-num-procs 1 \ + --distributed-no-spawn \ + --distributed-backend hccl \ + --label-smoothing 0.1 \ + --optimizer adam --adam-betas '(0.9, 0.98)' \ + --lr-scheduler inverse_sqrt \ + --lr 1e-4 \ + --warmup-init-lr 1e-07 \ + --stop-min-lr 1e-09 \ + --warmup-updates 4000 \ + --max-update 400000 \ + --max-epoch $train_epochs \ + --max-tokens $token_size \ + --update-freq 1 \ + --seed 1 \ + --log-format simple \ + --skip-invalid-size-inputs-valid-test \ + --fp16 \ + --eval-bleu \ + --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ + --eval-bleu-detok moses \ + --eval-bleu-remove-bpe=sentencepiece \ + --best-checkpoint-metric bleu --maximize-best-checkpoint-metric > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +WPS=`grep 'train_inner ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "wps=" '{print $NF}'|awk -F "wps" '{print 
$1}'|awk -F "," '{print $1}'|awk 'END {print}'` +train_wall=`grep 'train_inner ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "train_wall=" '{print $NF}'|awk 'NR==1{min=$1;next}{min=min<$1?min:$1}END{print min}'` +TRAIN_WALL=`grep 'train_inner ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "train_wall=" '{print $NF}'|awk -F "," '{print $1}'|tail -n 20|awk '{sum+=$1} END {print"",sum/NR}'|sed s/[[:space:]]//g` + +echo "Final Performance words/sec : $WPS" +echo "train_wall : $TRAIN_WALL" + +echo "E2E Training Duration sec : $e2e_time" + + +#性能看护结果汇总 +#训练用例信息,不需要修改 +TokenSize=${token_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${TokenSize}_${RANK_SIZE}'p'_'perf' +#吞吐量 +ActualWPS=${WPS} +##单迭代训练时长 +TrainingTime=${train_wall} +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep -r "loss=" ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $13}' > ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TokenSize = ${TokenSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualWPS = ${ActualWPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> 
${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_performance_8p.sh b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_performance_8p.sh new file mode 100644 index 0000000000..3320257e41 --- /dev/null +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/test/train_performance_8p.sh @@ -0,0 +1,132 @@ +#!/bin/bash + +Network="Deltalm_for_PyTorch" +export RANK_SIZE=8 +export MASTER_ADDR=localhost +export MASTER_PORT=29688 +data_path="" + +#训练epoch +train_epochs=1 +#训练batch_size,,需要模型审视修改 +token_size=1024 + +# 参数校验,data_path为必传参数,其他参数的增删由模型自身决定;此处新增参数需在上面有定义并赋值 +for para in $* +do + if [[ $para == --data_path* ]];then + data_path=`echo ${para#*=}` + elif [[ $para == --epochs* ]];then + train_epochs=`echo ${para#*=}` + elif [[ $para == --batch_size* ]];then + token_size=`echo ${para#*=}` + fi +done + +#校验是否传入data_path,不需要修改 +if [[ $data_path == "" ]];then + echo "[Error] para \"data_path\" must be confing" + exit 1 +fi + +###############指定训练脚本执行路径############### +# cd到与test文件夹同层级目录下执行脚本,提高兼容性;test_path_dir为包含test文件夹的路径 +cur_path=`pwd` +cur_path_last_dirname=${cur_path##*/} +if [ x"${cur_path_last_dirname}" == x"test" ]; then + test_path_dir=${cur_path} + cd ..
+ cur_path=`pwd` +else + test_path_dir=${cur_path}/test +fi +save_dir=${cur_path}/checkpoint + +#创建DeviceID输出目录,不需要修改 +ASCEND_DEVICE_ID=0 +if [ -d ${test_path_dir}/output/$ASCEND_DEVICE_ID ];then + rm -rf ${test_path_dir}/output/* + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +else + mkdir -p ${test_path_dir}/output/$ASCEND_DEVICE_ID +fi + + +#训练开始时间,不需要修改 +start_time=$(date +%s) +# 非平台场景时source 环境变量 +check_etp_flag=`env | grep etp_running_flag` +etp_flag=`echo ${check_etp_flag#*=}` +if [ x"${etp_flag}" != x"true" ];then + source ${test_path_dir}/env_npu.sh +fi + +nohup python train.py $data_path \ + --save-dir $save_dir \ + --arch deltalm_base \ + --pretrained-deltalm-checkpoint $data_path/deltalm-base.pt \ + --share-all-embeddings \ + --max-source-positions 512 --max-target-positions 512 \ + --criterion label_smoothed_cross_entropy \ + --label-smoothing 0.1 \ + --optimizer adam --adam-betas '(0.9, 0.98)' \ + --lr-scheduler inverse_sqrt \ + --lr 1e-4 \ + --warmup-init-lr 1e-07 \ + --stop-min-lr 1e-09 \ + --warmup-updates 4000 \ + --max-update 400000 \ + --max-epoch $train_epochs \ + --max-tokens $token_size \ + --update-freq 1 \ + --seed 1 \ + --log-format simple \ + --skip-invalid-size-inputs-valid-test \ + --fp16 \ + --eval-bleu \ + --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ + --eval-bleu-detok moses \ + --eval-bleu-remove-bpe=sentencepiece \ + --best-checkpoint-metric bleu --maximize-best-checkpoint-metric > ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log 2>&1 & + +wait +#训练结束时间,不需要修改 +end_time=$(date +%s) +e2e_time=$(( $end_time - $start_time )) + +#结果打印,不需要修改 +echo "------------------ Final result ------------------" +#输出性能FPS,需要模型审视修改 +WPS=`grep 'train_inner ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "wps=" '{print $NF}'|awk -F "wps" '{print $1}'|awk -F "," '{print $1}'|awk 'END {print}'` +train_wall=`grep 'train_inner ' 
${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "train_wall=" '{print $NF}'|awk 'NR==1{min=$1;next}{min=min<$1?min:$1}END{print min}'` +TRAIN_WALL=`grep 'train_inner ' ${test_path_dir}/output/${ASCEND_DEVICE_ID}/train_${ASCEND_DEVICE_ID}.log|awk -F "train_wall=" '{print $NF}'|awk -F "," '{print $1}'|tail -n 20|awk '{sum+=$1} END {print"",sum/NR}'|sed s/[[:space:]]//g` + +echo "Final Performance words/sec : $WPS" +echo "train_wall : $TRAIN_WALL" + +echo "E2E Training Duration sec : $e2e_time" + + +#性能看护结果汇总 +#训练用例信息,不需要修改 +TokenSize=${token_size} +DeviceType=`uname -m` +CaseName=${Network}_bs${TokenSize}_${RANK_SIZE}'p'_'perf' +#吞吐量 +ActualWPS=${WPS} +##单迭代训练时长 +TrainingTime=${train_wall} +#从train_$ASCEND_DEVICE_ID.log提取Loss到train_${CaseName}_loss.txt中,需要根据模型审视 +grep -r "loss=" ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_$ASCEND_DEVICE_ID.log | awk '{print $13}' > ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt + +ActualLoss=`awk 'END {print}' ${test_path_dir}/output/$ASCEND_DEVICE_ID/train_${CaseName}_loss.txt` +#关键信息打印到${CaseName}.log中,不需要修改 +echo "Network = ${Network}" > ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "RankSize = ${RANK_SIZE}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TokenSize = ${TokenSize}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "DeviceType = ${DeviceType}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "CaseName = ${CaseName}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualWPS = ${ActualWPS}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "TrainingTime = ${TrainingTime}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "ActualLoss = ${ActualLoss}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log +echo "E2ETrainingTime = ${e2e_time}" >> ${test_path_dir}/output/$ASCEND_DEVICE_ID/${CaseName}.log diff --git 
a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/train.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/train.py index 6ee12117d6..53f768dd78 100644 --- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/train.py +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/train.py @@ -1,7 +1,40 @@ +# BSD 3-Clause License# +# Copyright (c) 2017 xxxx +# All rights reserved. +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.i +# ============================================================================ +import torch +import torch_npu +from torch_npu.contrib import transfer_to_npu +from fairseq import fairseq import deltalm - from fairseq_cli.train import cli_main if __name__ == "__main__": - cli_main() \ No newline at end of file + cli_main() -- Gitee From b622681cbcc74e7263d0e914e44f1ff65801c4d4 Mon Sep 17 00:00:00 2001 From: wuxiankun Date: Thu, 6 Jul 2023 13:58:59 +0800 Subject: [PATCH 2/2] add license for modified files --- .../nlp/Deltalm_for_PyTorch/auto-data.sh | 30 ++++++++++++ .../examples/prepare_iwslt14.sh | 6 ++- .../fairseq/fairseq/modules/layer_norm.py | 46 +++++++++++++++---- .../fairseq/optim/dynamic_loss_scaler.py | 30 ++++++++++++ .../fairseq/fairseq/sequence_generator.py | 30 ++++++++++++ .../fairseq/fairseq/tasks/fairseq_task.py | 30 ++++++++++++ .../fairseq/fairseq/trainer.py | 30 ++++++++++++ .../fairseq/fairseq/utils.py | 30 ++++++++++++ .../fairseq/fairseq_cli/train.py | 30 ++++++++++++ 9 files changed, 252 insertions(+), 10 deletions(-) create mode 100644 PyTorch/built-in/nlp/Deltalm_for_PyTorch/auto-data.sh diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/auto-data.sh b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/auto-data.sh new file mode 100644 index 0000000000..7ad4c0d168 --- /dev/null +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/auto-data.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +oridata_path=$1 +data_bin=$2 +spm_path=$3 
+dict_path=$4 +mose_http=$5 +iws_url=$6 +bash examples/prepare_iwslt14.sh $oridata_path $mose_http $iws_url +wait + +orill_dir=$oridata_path/iwslt14.tokenized.de-en +spm_encode --model=$spm_path --output_format=piece < $orill_dir/train.de > train.spm.src +spm_encode --model=$spm_path --output_format=piece < $orill_dir/train.en > train.spm.tgt +spm_encode --model=$spm_path --output_format=piece < $orill_dir/valid.de > valid.spm.src +spm_encode --model=$spm_path --output_format=piece < $orill_dir/valid.en > valid.spm.tgt +spm_encode --model=$spm_path --output_format=piece < $orill_dir/test.de > test.spm.src +spm_encode --model=$spm_path --output_format=piece < $orill_dir/test.en > test.spm.tgt + +wait +python preprocess.py \ + --trainpref train.spm \ + --validpref valid.spm \ + --testpref test.spm \ + --source-lang src --target-lang tgt \ + --destdir $data_bin \ + --srcdict $dict_path \ + --tgtdict $dict_path \ + --workers 40 + diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/examples/prepare_iwslt14.sh b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/examples/prepare_iwslt14.sh index ab1437d2fb..d745b30b2a 100644 --- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/examples/prepare_iwslt14.sh +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/examples/prepare_iwslt14.sh @@ -6,15 +6,17 @@ mkdir -p $1 cd $1 +mose_http=$2 +iws_url=$3 echo 'Cloning Moses github repository (for tokenization scripts)...' -git clone https://github.com/moses-smt/mosesdecoder.git +git clone $mose_http SCRIPTS=mosesdecoder/scripts TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl LC=$SCRIPTS/tokenizer/lowercase.perl CLEAN=$SCRIPTS/training/clean-corpus-n.perl -URL="http://dl.fbaipublicfiles.com/fairseq/data/iwslt14/de-en.tgz" +URL=$iws_url GZ=de-en.tgz if [ ! 
-d "$SCRIPTS" ]; then diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/layer_norm.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/layer_norm.py index 234609d9e2..85b5decef4 100644 --- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/layer_norm.py +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/modules/layer_norm.py @@ -1,3 +1,33 @@ +# BSD 3-Clause License# +# Copyright (c) 2017 xxxx +# All rights reserved. +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.i +# ============================================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the @@ -27,14 +57,6 @@ except ImportError: has_fused_layernorm = False -def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False): - if torch.jit.is_scripting(): - export = True - if not export and torch.cuda.is_available() and has_fused_layernorm: - return FusedLayerNorm(normalized_shape, eps, elementwise_affine) - return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) - - class Fp32LayerNorm(nn.LayerNorm): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -48,3 +70,11 @@ class Fp32LayerNorm(nn.LayerNorm): self.eps, ) return output.type_as(input) + + +def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False): + if torch.jit.is_scripting(): + export = True + if not export and torch.cuda.is_available() and has_fused_layernorm: + return FusedLayerNorm(normalized_shape, eps, elementwise_affine) + return Fp32LayerNorm(normalized_shape, eps, elementwise_affine) diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/optim/dynamic_loss_scaler.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/optim/dynamic_loss_scaler.py index 743e30a675..b4ef114175 100644 --- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/optim/dynamic_loss_scaler.py 
+++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/optim/dynamic_loss_scaler.py @@ -1,3 +1,33 @@ +# BSD 3-Clause License +# Copyright (c) 2017 xxxx +# All rights reserved. +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# ============================================================================ # Copyright (c) Facebook, Inc. and its affiliates.
# # This source code is licensed under the MIT license found in the diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/sequence_generator.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/sequence_generator.py index 83c7b0e216..5615f15240 100644 --- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/sequence_generator.py +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/sequence_generator.py @@ -1,3 +1,33 @@ +# BSD 3-Clause License# +# Copyright (c) 2017 xxxx +# All rights reserved. +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.i +# ============================================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/tasks/fairseq_task.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/tasks/fairseq_task.py index a097e75148..dacbba4774 100644 --- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/tasks/fairseq_task.py +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/tasks/fairseq_task.py @@ -1,3 +1,33 @@ +# BSD 3-Clause License# +# Copyright (c) 2017 xxxx +# All rights reserved. +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.i +# ============================================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/trainer.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/trainer.py index 5424171996..f5b0fbd72c 100644 --- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/trainer.py +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/trainer.py @@ -1,3 +1,33 @@ +# BSD 3-Clause License# +# Copyright (c) 2017 xxxx +# All rights reserved. +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.i +# ============================================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/utils.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/utils.py index 96cfce1085..6110417982 100644 --- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/utils.py +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq/utils.py @@ -1,3 +1,33 @@ +# BSD 3-Clause License# +# Copyright (c) 2017 xxxx +# All rights reserved. +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.i +# ============================================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the diff --git a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq_cli/train.py b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq_cli/train.py index 8b93429c23..b79986e062 100644 --- a/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq_cli/train.py +++ b/PyTorch/built-in/nlp/Deltalm_for_PyTorch/fairseq/fairseq_cli/train.py @@ -1,4 +1,34 @@ #!/usr/bin/env python3 -u +# BSD 3-Clause License# +# Copyright (c) 2017 xxxx +# All rights reserved. 
+# Copyright 2021 Huawei Technologies Co., Ltd +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.i +# ============================================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the -- Gitee