diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/LICENSE b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..dffead34fdeed6037d65ebb08c6b67f754311ea7 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. 
We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright Aohan Zeng + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/MODEL_LICENSE b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/MODEL_LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..d1eb47b011600605984e6bd7bda81a8bb1c27de6 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/MODEL_LICENSE @@ -0,0 +1,33 @@ +The GLM-130B License + +1. Definitions + +“Licensor” means the GLM-130B Model Team that distributes its Software. + +“Software” means the GLM-130B model parameters made available under this license. + +2. License Grant + +Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software solely for your non-commercial research purposes. + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +3. Restriction + +You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any commercial, military, or illegal purposes. + +You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings. + +4. Disclaimer + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +5. Limitation of Liability + +EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +6. Dispute Resolution + +This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing. + +Note that the license is subject to update to a more comprehensive version. For any questions related to the license and copyright, please contact us at glm-130b@googlegroups.com. 
diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/README.md b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd00a1287ed0a655f062498d9300e2621c0b361f --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/README.md @@ -0,0 +1,202 @@ + + +

+ 🌐 Blog • ⏬ Download Model • 🪧 Demo • ✉️ Email • 📃 Paper [ICLR 2023]
+

+ +

+ 💬 Google Group (Updates) or Wechat Group or Slack channel (Discussions) +

+
+# GLM-130B: An Open Bilingual Pre-Trained Model
+
+GLM-130B is an open bilingual (English & Chinese) bidirectional dense model with 130 billion parameters, pre-trained using the algorithm of [General Language Model (GLM)](https://aclanthology.org/2022.acl-long.26). It is designed to support inference of the full 130B-parameter model on **a single A100 (40G * 8)** or **V100 (32G * 8) server**. With INT4 quantization, the hardware requirements can be further reduced to **a single server with 4 * RTX 3090 (24G)** with **almost no performance degradation**. As of July 3rd, 2022, GLM-130B has been trained on over 400 billion text tokens (200B each for Chinese and English) and has the following unique features:
+
+- **Bilingual:** supports both English and Chinese.
+- **Performance (EN):** better than GPT-3 175B (+4.0%), OPT-175B (+5.5%), and BLOOM-176B (+13.0%) on LAMBADA and slightly better than GPT-3 175B (+0.9%) on MMLU.
+- **Performance (CN):** significantly better than ERNIE TITAN 3.0 260B on 7 zero-shot CLUE datasets (+24.26%) and 5 zero-shot FewCLUE datasets (+12.75%).
+- **Fast Inference:** supports fast inference on both [SAT](https://github.com/THUDM/SwissArmyTransformer) and [FasterTransformer](https://github.com/NVIDIA/FasterTransformer) (up to 2.5X faster) with a single A100 server.
+- **Reproducibility:** all results (30+ tasks) can be easily reproduced with open-sourced code and model checkpoints.
+- **Cross-Platform:** supports training and inference on NVIDIA, Hygon DCU, Ascend 910, and Sunway (will be released soon).
+
+This repository mainly focuses on the evaluation of GLM-130B. The training part is available for research purposes; please send an email to glm-130b@googlegroups.com to apply for access. If you find our work and our open-sourced efforts useful, please give us a ⭐️ to encourage our future development! :)
+
+## News
+
+- **[2023.01.21]** GLM-130B has been accepted to [ICLR 2023](https://iclr.cc/Conferences/2023)!
+- **[2022.10.06]** Our [paper](http://arxiv.org/abs/2210.02414) for GLM-130B is out!
+- **[2022.08.24]** We are proud to publish the quantized version of GLM-130B. While preserving the activation precision as FP16, the model weights can be quantized to as low as **INT4 with almost no degradation of performance**, further reducing the hardware requirements of GLM-130B to **a single server with 4 * RTX 3090 (24G)**! See [Quantization of GLM-130B](docs/quantization.md) for details.
+
+For smaller models, please find [monolingual GLMs](https://github.com/THUDM/GLM) (English: 10B/2B/515M/410M/335M/110M, Chinese: 10B/335M) and a [1B multilingual GLM](https://github.com/THUDM/Multilingual-GLM) (104 languages).
+
+## Getting Started
+
+### Environment Setup
+
+#### Hardware
+
+| **Hardware**    | **GPU Memory** | **Quantization** | **Weight Offload** |
+| --------------- | -------------- | ---------------- | ------------------ |
+| 8 * A100        | 40 GB          | No               | No                 |
+| 8 * V100        | 32 GB          | No               | Yes (BMInf)        |
+| 8 * V100        | 32 GB          | INT8             | No                 |
+| 8 * RTX 3090    | 24 GB          | INT8             | No                 |
+| 4 * RTX 3090    | 24 GB          | INT4             | No                 |
+| 8 * RTX 2080 Ti | 11 GB          | INT4             | No                 |
+
+It is recommended to use an A100 (40G * 8) server, as all reported GLM-130B evaluation results (~30 tasks) can be easily reproduced with a single A100 server in about half a day. With INT8/INT4 quantization, efficient inference on **a single server with 4 * RTX 3090 (24G)** is possible; see [Quantization of GLM-130B](docs/quantization.md) for details.
Combining quantization and weight offloading techniques, GLM-130B can also run inference on servers with even smaller GPU memory; see [Low-Resource Inference](docs/low-resource-inference.md) for details.
+
+#### Software
+
+The GLM-130B code is built on top of [SAT](https://github.com/THUDM/SwissArmyTransformer). We recommend using [Miniconda](https://docs.conda.io/en/latest/miniconda.html) to manage your environment and installing additional dependencies via `pip install -r requirements.txt`. Here are the recommended environment configurations:
+
+- Python 3.9+ / CUDA 11+ / PyTorch 1.10+ / DeepSpeed 0.6+ / Apex (**installation with CUDA and C++ extensions is required, see [here](https://github.com/NVIDIA/apex/#linux)**)
+- SwissArmyTransformer>=0.2.11 is required for quantization
+
+#### Model weights
+
+Download the GLM-130B model checkpoint from [here](https://docs.google.com/forms/d/e/1FAIpQLSehr5Dh_i3TwACmFFi8QEgIVNYGmSPwV0GueIcsUev0NEfUug/viewform?usp=sf_link), make sure all 60 chunks are downloaded completely, then use the following command to merge them into a single archive file and extract it:
+
+```bash
+cat glm-130b-sat.tar.part_* > glm-130b-sat.tar
+tar xvf glm-130b-sat.tar
+```
+
+Set `CHECKPOINT_PATH` in `configs/model_glm_130b.sh` to the path of the extracted folder. Since the checkpoint file is up to 260G, using an SSD or RAM disk is recommended to reduce the checkpoint loading time. The checkpoint we distribute is split for 8-way tensor parallelism; a conversion script is also provided if you need to change the tensor parallel dimension:
+
+```bash
+python tools/convert_tp.py \
+    --input-folder <SRC_CKPT_PATH> \
+    --output-folder <DST_CKPT_PATH> \
+    --target-tp <TARGET_TP>
+```
+
+### Left-To-Right Generation / Blank Filling
+
+```bash
+bash scripts/generate.sh --input-source interactive
+```
+
+You can also specify an input file by `--input-source input.txt`.
+
+GLM-130B uses two different mask tokens: `[MASK]` for short blank filling and `[gMASK]` for left-to-right long text generation. When the input does not contain any MASK token, `[gMASK]` will be automatically appended to the end of the text.
+
+Left-To-Right Generation Examples + +#### Example 1 +Input: Who is the greatest artist? The greatest artist is [gMASK] + +Output: Who is the greatest artist? The greatest artist is not the one who paints the most beautiful picture, or writes the best poem, or plays the best symphony. The greatest artist is the one whose work has the greatest impact on the lives of others. The Greatest Artist is the artist who inspires us to be our best selves. The Great Artist is not the artist whose work is flawless. The great artist is not one who is perfect. The true great artist has the ability to make us feel something. The artist who makes us feel is the great artist. The art of the great artists is to make you feel something that you have never felt before. The greatness of an artist is in the ability of their work to touch the lives and hearts of others, and to make them feel something they have never experienced before. This is the true greatness of the artist. 2. The Significance of the Great Artist The significance of the greatest artist cannot be overstated. The significance and impact of the art of a great artist cannot even be measured. The impact of a single work of art can change the course of history. A single work can change a person’s life. A work of great art can make you laugh, make you cry, make your heart race, or make you think. A great work of artistic expression has the power to change the way you look at the world. The work of a truly great artist can change your life forever. 3. The Impact of the Greatest Artist The greatest artists have the ability and the power of their art to change people’s lives. They have the power and the ability, through their art, to make people feel something new. The power of the work of the truly great artists cannot be measured, but it can be felt. The greater the impact of an artists work, the greater the power it has to change lives. 4. The Influence of the Art of the Artist The art and the work created by the greatest artists has the potential to change a life. The influence of the artists work cannot be quantified, but is felt by those who experience it. The truly great work has an ability to touch people’ lives in a way that no other work can. + +#### Example 2 (Chinese) +Input: 问题:冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者 回答: [gMASK] + +Output: 问题:冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者 回答:避寒的话,建议去海南三亚。 三亚位于海南岛的最南端,是中国最南部的热带滨海旅游城市,中国海滨城市,是中国空气质量最好的城市、全国最长寿地区(平均寿命80岁)。三亚市别称鹿城,又被称为"东方夏威夷",位居中国四大一线旅游城市"三威杭厦"之首,拥有全岛最美丽的海滨风光。三亚拥有被无数城市嫉妒的清新空气,柔和海滩。在沙滩悠闲散步、沐浴傍晚温和阳光,在海边玩耍,在雨林里面呼吸健康,欣赏自然奇观,一切都是那么令人享受。 气候 三亚地处热带,属热带季风气候区域,终年气温高,寒暑变化不大,年平均气温25.5度。三亚最热的月份是6月,平均气温为28.5°C,极端最高气温为35.7°C ,三亚没有明显的冬季,而且冬季更是到海滨旅游的最好时候,所以三亚是全年全天候避寒、消暑、度假、旅游的好地方。不过,三亚最旺的旅游时间是从10月份到农历春节,这个时候的人特别多,房间也不好订,建议最好避开高峰。三亚的三个旅游高峰期是"五一"、"十一"、"春节",这三个节日期间的房价都会上浮几倍,如果你选择了这三个假期来三亚的话要提前一个星期定房和咨询。 +
+ +
+Blank Filling Examples + +#### Example 1 +Input: Ng is an adjunct professor at [MASK] (formerly associate professor and Director of its Stanford AI Lab or SAIL ). Also a pioneer in online education, Ng co-founded Coursera and deeplearning.ai. + +Output: Ng is an adjunct professor at Stanford University (formerly associate professor and Director of its Stanford AI Lab or SAIL ). Also a pioneer in online education, Ng co-founded Coursera and deeplearning.ai.
+ +#### Example 2 (Chinese) +Input: 凯旋门位于意大利米兰市古城堡旁。1807年为纪念[MASK]而建,门高25米,顶上矗立两武士青铜古兵车铸像。 + +Output: 凯旋门位于意大利米兰市古城堡旁。1807年为纪念拿破仑胜利而建,门高25米,顶上矗立两武士青铜古兵车铸像。 +
+ +
+Arguments Useful in Generation
+
+- `--input-source [path] or "interactive"` The input file's path. It can also be "interactive", which will launch an interactive CLI.
+- `--output-path [path]` The folder where the results are saved.
+- `--out-seq-length [int]` The maximum sequence length for generation (including context).
+- `--min-gen-length [int]` The minimum generation length for each MASK.
+- `--sampling-strategy "BaseStrategy" or "BeamSearchStrategy"` The sampling strategy used.
+  - For BeamSearchStrategy:
+    - `--num-beams [int]` The number of beams.
+    - `--length-penalty [float]` The length penalty applied during beam search; within [0, 1], larger values favor longer generations.
+    - `--no-repeat-ngram-size [int]` Prohibit repeated n-gram generation.
+    - `--print-all-beam` Print the generated results for all beams.
+  - For BaseStrategy:
+    - `--top-k [int]` Top-k sampling.
+    - `--top-p [float]` Top-p sampling.
+    - `--temperature [float]` The sampling temperature.
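+For example, a non-interactive run that reads prompts from a file and decodes with beam search could look like the sketch below. This is only illustrative: the file name `input.txt` and all flag values are made up for the example rather than defaults, and depending on how `scripts/generate.sh` forwards its arguments, some options may instead need to be edited inside the script.
+
+```bash
+# Two illustrative prompts: [MASK] for short blank filling,
+# [gMASK] for long left-to-right generation.
+cat > input.txt <<'EOF'
+Ng is an adjunct professor at [MASK] (formerly associate professor and Director of its Stanford AI Lab or SAIL ).
+Who is the greatest artist? The greatest artist is [gMASK]
+EOF
+
+bash scripts/generate.sh \
+    --input-source input.txt \
+    --output-path samples \
+    --out-seq-length 512 \
+    --min-gen-length 32 \
+    --sampling-strategy BeamSearchStrategy \
+    --num-beams 4 \
+    --length-penalty 1.0 \
+    --no-repeat-ngram-size 3
+```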
+
+### Evaluation
+
+We use YAML files to define tasks. Specifically, you can add multiple tasks or folders at a time for evaluation, and the evaluation script will automatically collect all YAML files under those folders recursively.
+
+```
+bash scripts/evaluate.sh task1.yaml task2.yaml dir1 dir2 ...
+```
+
+Download our evaluation dataset [here](https://cloud.tsinghua.edu.cn/f/826f0df4356f4022a264/), and set `DATA_PATH` in `scripts/evaluate.sh` to your local dataset directory. The `tasks` folder contains the YAML files for the 30+ tasks we evaluated for GLM-130B. Take the [CoLA](https://nyu-mll.github.io/CoLA/) task as an example: run `bash scripts/evaluate.sh tasks/bloom/glue_cola.yaml`, which outputs an accuracy of ~65% for the best prompt and ~57% for the median.
+
+Expected Output + +```plain +MultiChoiceTaskConfig(name='glue_cola', type=, path='/thudm/LargeScale/data/zeroshot/bloom/glue_cola', module=None, metrics=['Accuracy'], use_task_mask=False, use_multitask_encoding=False, unidirectional=False, max_seq_length=2048, file_pattern={'validation': '**/validation.jsonl'}, micro_batch_size=8) +Evaluating task glue_cola: + Evaluating group validation: + Finish Following_sentence_acceptable/mul/validation.jsonl, Accuracy = 42.665 + Finish Make_sense_yes_no/mul/validation.jsonl, Accuracy = 56.951 + Finish Previous_sentence_acceptable/mul/validation.jsonl, Accuracy = 65.197 + Finish editing/mul/validation.jsonl, Accuracy = 57.622 + Finish is_this_correct/mul/validation.jsonl, Accuracy = 65.197 +Evaluation results of task glue_cola: + Group validation Accuracy: max = 65.197, median = 57.622, average = 57.526 +Finish task glue_cola in 101.2s. +``` +
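+The task definitions themselves are plain YAML files. The snippet below is a rough, hypothetical sketch of such a definition: the key names are inferred from the `MultiChoiceTaskConfig` dump shown above and may not match the actual schema exactly, and the task name `my_task` and its paths are made up for the example. Treat [Evaluate Your Own Tasks](docs/evaluate-your-own-tasks.md) as the authoritative reference.
+
+```bash
+mkdir -p tasks/my_task
+cat > tasks/my_task/my_task.yaml <<'EOF'
+# Hypothetical task definition; field names follow the MultiChoiceTaskConfig dump above.
+name: 'my_task'
+type: 'mul'                        # multiple-choice task; the exact identifier may differ
+path: 'my_task'                    # resolved relative to DATA_PATH
+file_pattern:
+  validation: '**/validation.jsonl'
+micro_batch_size: 8
+EOF
+
+# The evaluation script collects every YAML file under the given folder recursively.
+bash scripts/evaluate.sh tasks/my_task
+```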
+
+Multi-node evaluation can be configured by setting `HOST_FILE_PATH` (required by the [DeepSpeed launcher](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node)) in `scripts/evaluate_multiple_node.sh`. Set `DATA_PATH` in `scripts/evaluate_multiple_node.sh` and run the following command to evaluate all the tasks in the `./tasks` directory:
+
+```
+bash scripts/evaluate_multiple_node.sh ./tasks
+```
+
+See [Evaluate Your Own Tasks](docs/evaluate-your-own-tasks.md) for details on how to add new tasks.
+
+### 2.5X faster Inference using FasterTransformer
+
+By adapting the GLM-130B model to [FasterTransformer](https://github.com/NVIDIA/FasterTransformer), a highly optimized transformer model library by NVIDIA, we can reach up to a 2.5X speedup on generation; see [Inference with FasterTransformer](docs/inference-with-fastertransformer.md) for details.
+
+## License
+
+This repository is licensed under the [Apache-2.0 license](LICENSE). The use of the GLM-130B model weights is subject to the [Model License](MODEL_LICENSE).
+
+## Citation
+
+If you find our work useful, please consider citing GLM-130B:
+
+```
+@inproceedings{
+ zeng2023glm-130b,
+ title={{GLM}-130B: An Open Bilingual Pre-trained Model},
+ author={Aohan Zeng and Xiao Liu and Zhengxiao Du and Zihan Wang and Hanyu Lai and Ming Ding and Zhuoyi Yang and Yifan Xu and Wendi Zheng and Xiao Xia and Weng Lam Tam and Zixuan Ma and Yufei Xue and Jidong Zhai and Wenguang Chen and Zhiyuan Liu and Peng Zhang and Yuxiao Dong and Jie Tang},
+ booktitle={The Eleventh International Conference on Learning Representations (ICLR)},
+ year={2023},
+ url={https://openreview.net/forum?id=-Aw0rrrPUF}
+}
+```
+
+You may also consider GLM's original work in your reference:
+
+```
+@inproceedings{du2022glm,
+ title={GLM: General Language Model Pretraining with Autoregressive Blank Infilling},
+ author={Du, Zhengxiao and Qian, Yujie and Liu, Xiao and Ding, Ming and Qiu, Jiezhong and Yang, Zhilin and Tang, Jie},
+ booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
+ pages={320--335},
+ year={2022}
+}
+```
diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/README_zh.md b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/README_zh.md
new file mode 100644
index 0000000000000000000000000000000000000000..4a57b91c65b2b4b70c915a912ca69f4332f4fa66
--- /dev/null
+++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/README_zh.md
@@ -0,0 +1,379 @@
+
+

+ 🌐 博客 • ⏬ 下载模型 • 🪧 样例演示 • 💬 讨论 • ✉️ 邮箱 • 💬 谷歌群组 or 微信群 + • 📃 论文(敬请期待)
+

+ +# GLM-130B:开放的中英双语预训练模型 + +## 摘要:何为 GLM-130B? + +GLM-130B 是一个开源开放的双语(中文和英文)双向稠密模型,拥有 1300 亿个参数,模型架构采用通用语言模型(GLM)。它旨在支持在**一台 A100(40G * 8)** 或 **V100(32G * 8)服务器**上对千亿规模的参数进行推理。截至 2022 年 7 月 3 日,GLM-130B 已经对超过 4000 亿个文本标识符(中文和英文各 2000 亿)进行了训练,它有以下独特优势: + +* **双语**:同时支持中文和英文。 +* **任务表现(英文)**: 在 LAMBADA 上优于 GPT-3 175B(+4.0%)、OPT-175B(+5.5%)和 BLOOM-176B(+13.0%),在 MMLU 上略优于GPT-3 175B(+0.9%)。 +* **任务表现(中文)**:在 7 个零样本 CLUE 数据集(+24.26%)和 5 个零样本 FewCLUE 数据集(+12.75%)上明显优于 ERNIE TITAN 3.0 260B。 +* **快速推理**:支持用一台 A100 服务器使用 [SAT](https://github.com/THUDM/SwissArmyTransformer) 和 [FasterTransformer](https://github.com/NVIDIA/FasterTransformer) 进行快速推理(速度最高可达2.5倍)。 +* **可复现性**:所有的结果(超过30个任务)都可以用我们开源的代码和模型参数轻松复现。 +* **多平台**:支持在 NVIDIA、Hygon DCU、Ascend 910 和 Sunway 处理器上进行训练与推理(代码即将开源)。 + +## 快速上手 + +### 环境配置 + +我们的代码是建立在 [SAT](https://github.com/THUDM/SwissArmyTransformer) 之上的。我们推荐使用 Miniconda 来管理环境并通过 `pip install -r requirements.txt` 来安装额外的依赖包。以下是我们推荐的环境配置: + +- Python 3.9+ / PyTorch 1.10+ / DeepSpeed 0.6+ / Apex(**需要安装包含 CUDA 和 C++ 扩展的版本,[参考资料](https://github.com/NVIDIA/apex/#linux)**) + +建议使用 A100(40G * 8)服务器,因为所有报告的评估结果(约30个任务)都可以用一台 A100 服务器在大约半天内轻松再现。GLM-130B 也可以在具有较小 GPU 内存的服务器上进行推断,例如具有 V100(32G * 8)的服务器。详见 [Low-resource Inference](docs/low-resource-inference.md)。 + +从 [这里](https://models.aminer.cn/glm/zh-CN/download/GLM-130B) 申请下载 GLM-130B 的模型检查点,确保所有 60 个块都已完全下载,然后使用以下命令将它们合并为一个存档文件并解压缩: + +```bash +cat glm-130b-sat.tar.part_* > glm-130b-sat.tar +tar xvf glm-130b-sat.tar +``` + +将 `configs/model_glm_130b.sh` 中的 `CHECKPOINT_PATH` 设置为解压文件夹的路径。 由于 checkpoint 文件最大 260G,建议使用 SSD 或 RAM 盘来减少 checkpoint 加载时间。 + +### 自回归文本生成 / 中间文本填空 + +```bash +bash scripts/generate.sh --input-source interactive +``` + +你也可以通过 `--input-source input.txt` 指定一个输入文件。 + +GLM-130B 使用两种不同的掩码标记:`[MASK]` 用于简短的填空任务,`[gMASK]` 用于从左到右的长文本生成。当输入不包含任何 MASK 标记时,`[gMASK]` 将被自动附加到文本的末尾。 + +
+长文本生成样例 + +#### Example 1 +Input: Who is the greatest artist? The greatest artist is [gMASK] + +Output: Who is the greatest artist? The greatest artist is not the one who paints the most beautiful picture, or writes the best poem, or plays the best symphony. The greatest artist is the one whose work has the greatest impact on the lives of others. The Greatest Artist is the artist who inspires us to be our best selves. The Great Artist is not the artist whose work is flawless. The great artist is not one who is perfect. The true great artist has the ability to make us feel something. The artist who makes us feel is the great artist. The art of the great artists is to make you feel something that you have never felt before. The greatness of an artist is in the ability of their work to touch the lives and hearts of others, and to make them feel something they have never experienced before. This is the true greatness of the artist. 2. The Significance of the Great Artist The significance of the greatest artist cannot be overstated. The significance and impact of the art of a great artist cannot even be measured. The impact of a single work of art can change the course of history. A single work can change a person’s life. A work of great art can make you laugh, make you cry, make your heart race, or make you think. A great work of artistic expression has the power to change the way you look at the world. The work of a truly great artist can change your life forever. 3. The Impact of the Greatest Artist The greatest artists have the ability and the power of their art to change people’s lives. They have the power and the ability, through their art, to make people feel something new. The power of the work of the truly great artists cannot be measured, but it can be felt. The greater the impact of an artists work, the greater the power it has to change lives. 4. The Influence of the Art of the Artist The art and the work created by the greatest artists has the potential to change a life. The influence of the artists work cannot be quantified, but is felt by those who experience it. The truly great work has an ability to touch people’ lives in a way that no other work can. + +#### Example 2 (Chinese) +Input: 问题:冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者 回答: [gMASK] + +Output: 问题:冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者 回答:避寒的话,建议去海南三亚。 三亚位于海南岛的最南端,是中国最南部的热带滨海旅游城市,中国海滨城市,是中国空气质量最好的城市、全国最长寿地区(平均寿命80岁)。三亚市别称鹿城,又被称为"东方夏威夷",位居中国四大一线旅游城市"三威杭厦"之首,拥有全岛最美丽的海滨风光。三亚拥有被无数城市嫉妒的清新空气,柔和海滩。在沙滩悠闲散步、沐浴傍晚温和阳光,在海边玩耍,在雨林里面呼吸健康,欣赏自然奇观,一切都是那么令人享受。 气候 三亚地处热带,属热带季风气候区域,终年气温高,寒暑变化不大,年平均气温25.5度。三亚最热的月份是6月,平均气温为28.5°C,极端最高气温为35.7°C ,三亚没有明显的冬季,而且冬季更是到海滨旅游的最好时候,所以三亚是全年全天候避寒、消暑、度假、旅游的好地方。不过,三亚最旺的旅游时间是从10月份到农历春节,这个时候的人特别多,房间也不好订,建议最好避开高峰。三亚的三个旅游高峰期是"五一"、"十一"、"春节",这三个节日期间的房价都会上浮几倍,如果你选择了这三个假期来三亚的话要提前一个星期定房和咨询。 +
+ +
+文本填空样例 + +#### Example 1 +Input: Ng is an adjunct professor at [MASK] (formerly associate professor and Director of its Stanford AI Lab or SAIL ). Also a pioneer in online education, Ng co-founded Coursera and deeplearning.ai. + +Output: Ng is an adjunct professor at Stanford University (formerly associate professor and Director of its Stanford AI Lab or SAIL ). Also a pioneer in online education, Ng co-founded Coursera and deeplearning.ai.
+ +#### Example 2 (Chinese) +Input: 凯旋门位于意大利米兰市古城堡旁。1807年为纪念[MASK]而建,门高25米,顶上矗立两武士青铜古兵车铸像。 + +Output: 凯旋门位于意大利米兰市古城堡旁。1807年为纪念拿破仑胜利而建,门高25米,顶上矗立两武士青铜古兵车铸像。 +
+ + +
+控制生成的主要超参数
+
+- `--input-source [path] or "interactive"`. 输入文件的路径。当设为"interactive"时,将会启动交互式CLI。
+- `--output-path [path]`. 结果输出路径。
+- `--out-seq-length [int]`. (包括输入内容在内的)最大输出序列长度。
+- `--min-gen-length [int]`. 每个MASK标识符位置的最小生成长度。
+- `--sampling-strategy "BaseStrategy" or "BeamSearchStrategy"`. 生成的采样策略。
+  - 对于 BeamSearchStrategy(集束搜索):
+    - `--num-beams [int]`. 集束数目。
+    - `--length-penalty [float]`. 生成长度惩罚项;数值范围[0, 1],数值越大生成长度越长。
+    - `--no-repeat-ngram-size [int]`. 禁止重复生成的n-gram长度。
+    - `--print-all-beam`. 是否打印每一束搜索结果。
+  - 对于 BaseStrategy:
+    - `--top-k [int]`. Top k 采样。
+    - `--top-p [float]`. Top p 采样。
+    - `--temperature [float]`. 采样时设置的温度项。
+ +### 评估 + +我们使用YAML文件来定义任务。具体来说,你可以一次添加多个任务或文件夹进行评估,评估脚本会自动递归地收集这些文件夹下的所有YAML文件。 + +``` +bash scripts/evaluate.sh task1.yaml task2.yaml dir1 dir2 ... +``` + +[从这里](https://cloud.tsinghua.edu.cn/f/9257ee84045644b8ac06/)下载我们的评估数据集,并在 `scripts/evaluate.sh` 中设置 `DATA_PATH` 为你的本地数据集目录。任务文件夹包含我们为 GLM-130B 评估的 30 多个任务的 YAML 文件。以 [CoLA](https://nyu-mll.github.io/CoLA/) 任务为例,运行 `bash scripts/evaluate.sh tasks/bloom/glue_cola.yaml`,其输出的最佳提示准确率约为 65%,中值约为 57%。 + +
+预期输出 + +```plain +MultiChoiceTaskConfig(name='glue_cola', type=, path='/thudm/LargeScale/data/zeroshot/bloom/glue_cola', module=None, metrics=['Accuracy'], use_task_mask=False, use_multitask_encoding=False, unidirectional=False, max_seq_length=2048, file_pattern={'validation': '**/validation.jsonl'}, micro_batch_size=8) +Evaluating task glue_cola: + Evaluating group validation: + Finish Following_sentence_acceptable/mul/validation.jsonl, Accuracy = 42.665 + Finish Make_sense_yes_no/mul/validation.jsonl, Accuracy = 56.951 + Finish Previous_sentence_acceptable/mul/validation.jsonl, Accuracy = 65.197 + Finish editing/mul/validation.jsonl, Accuracy = 57.622 + Finish is_this_correct/mul/validation.jsonl, Accuracy = 65.197 +Evaluation results of task glue_cola: + Group validation Accuracy: max = 65.197, median = 57.622, average = 57.526 +Finish task glue_cola in 101.2s. +``` +
+ +可以通过在 `scripts/evaluate_multiple_node.sh` 中设置 `HOST_FILE_PATH`([DeepSpeed lanucher](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node) 要求)来配置多节点评估。在 `scripts/evaluate_multiple_node.sh` 中设置 `DATA_PATH` 并运行以下命令来评估`./task`目录中的所有任务。 + +``` +bash scripts/evaluate_multiple_node.sh ./tasks +``` + +关于如何添加新任务的细节,请参见 [评估你自己的任务](docs/evaluate-your-own-tasks.md)。 + +### 使用 FasterTransformer 加速推理速度(高达 2.5 倍) + +- 通过将 GLM-130B 模型与 [FasterTransfomer](https://github.com/NVIDIA/FasterTransformer)(NVIDIA 高度优化的 Transformer 模型库)相适应,我们可以在生成时达到 2.5 倍的速度,详见 [Inference with FasterTransformer](docs/inference-with-fastertransformer.md) 。 + + +## 何为GLM-130B? + +GLM-130B是一个开放的双语(中文与英文)双向语言模型,含1300亿个参数。截至2022年7月,它已经训练了超过4000亿个文本标记。它的底层架构基于[通用语言模型(GLM)](https://aclanthology.org/2022.acl-long.26/),在语言理解和语言生成任务上均展示出强大的性能。 + +### 架构 + +GLM-130B将BERT和GPT的目标进行了统一,并与最近提出的一些技术进行结合以提升语言模型的性能表现。 + +#### 1\. 训练目标:自回归文本填空 + +GLM利用自回归文本填空作为其主要的预训练目标。它掩盖了随机的连续跨度(例如,下面的例子中的 "complete unknown"),并对其进行自回归预测。上下文之间的注意力(例如,"like a [MASK], like a rolling stone")是双向的。相反,被掩盖的标记之间的注意力,和从上下文到被掩盖的标识符的注意力是自回归掩码的。 + + + +在GLM-130B的实现中,有两种不同的MASK标识符,表示两个不同的目的: + +* `[MASK]`根据[泊松分布](https://en.wikipedia.org/wiki/Poisson_distribution) (λ=3)对输入中标识符进行短跨度的采样; +* `[gMASK]`掩盖一个长的跨度,从其位置到整个文本的结束。 + +`[sop]`标识符表示一个片断的开始,`[eop]`表示一个片断的结束。这两个目标在GLM-130B的预训练中是混合的,分别占预训练标记的30%和70%。 + +| | +|:--:| +| *例如:GLM-130B是如何对 `"like a complete unknown, like a rolling stone"`进行预训练的* | + +#### 2\. 位置编码:旋转位置编码 + +GLM-130B使用[旋转位置编码(RoPE)](https://arxiv.org/abs/2104.09864),谷歌的[PaLM](https://ai.googleblog.com/2022/04/pathways-language-model-palm-scaling-to.html)和[ElutherAI](https://www.eleuther.ai/)的GPT-*系列也采用这种编码。RoPE是一种相对位置编码,它利用复数空间的正交投影矩阵来表示标识符的相对距离。还有其他的相对位置编码选项,如Bigscience的[BLOOM](https://huggingface.co/bigscience/bloom)所使用的[AliBi](https://arxiv.org/abs/2108.12409)。但在我们的初步实验中,我们发现。 + +* 当序列长度增长时,RoPE的实现速度更快。 +* RoPE对双向注意力更友好,在下游微调实验中效果更好 + +因此,对于GLM-130B,RoPE是一种有效的、高效的位置编码。 + +#### 3\. 归一化:使用DeepNet的Post-LN + +层归一化(LayerNorm,或LN)是transformer中的一个重要组成部分,其应用可以大大影响训练的稳定性和性能。BERT应用了Post-LN,这意味着LayerNorm是在添加残余分支后应用的。然而,[后续工作](https://arxiv.org/abs/2002.04745)表明,单纯的Post-LN会导致预训练的不稳定,因此现有的大规模模型都选择Pre-LN架构,即在添加残差分支之前应用LayerNorm。 + +| | +|:--:| +| *(a) Post-LN在下游任务中表现更佳;(b) Post-LN + DeepNorm 比 Sandwich-LN 要更加稳定* | + +尽管如此,在现有的实践中,Pre-LN在用FP16训练大规模模型时仍然可能不稳定。[OPT-175B](https://arxiv.org/abs/2205.01068)在训练崩溃时手动调整学习率;[BLOOM](https://huggingface.co/bigscience/bloom)使用BF16(仅适用于NVIDIA Ampere GPU:A100s和3090s)以获得更好的浮点精度来避免崩溃。[CogView](https://proceedings.neurips.cc/paper/2021/file/a4d92e2cd541fca87e4620aba658316d-Paper.pdf)提出了Sandwich-LN作为一种补救措施。更重要的是,[近期工作](https://aclanthology.org/2021.findings-acl.81.pdf)表明,与Post-LN相比,Pre-LN的下游微调性能更差。 + +考虑到所有这些因素,在GLM-130B中,我们决定使用Post-LN,并使用新提出的[DeepNorm](https://arxiv.org/abs/2203.00555)来克服不稳定性。DeepNorm的重点是改进初始化,可以帮助Post-LN变换器扩展到1000层以上。在我们的初步实验中,模型扩展到130B,Sandwich-LN的梯度在大约2.5k步时就会出现损失突变(导致损失发散),而带有DeepNorm的Post-Ln则保持健康并呈现出较小的梯度大小(即更稳定)。 + +#### 4\. 
前馈网络:Gated Linear Unit (GLU) + GeLU 激活 + +最近一些改进transformer结构的努力集中在前馈网络(FFN)上,包括用[GLU](https://arxiv.org/abs/1612.08083)(在PaLM中采用)和新提出的[门控注意单元(GAU)](https://arxiv.org/abs/2202.10447)取代它。 + +| | RTE | COPA | BoolQ | WSC | Average | +|------------------------------|------------|------------|------------|------------|---------| +| GLM-base (GeGLU-Sandwich_LN) | 71.00±0.61 | 77.00±1.63 | 77.24±0.43 | 78.21±1.81 | 75.08 | +| GLM-base (GAU-Pre_LN) | | | _diverged_ | | | +| GLM-base (GAU-Sandwich_LN) | 69.92±0.61 | 75.67±0.94 | 77.00±0.15 | 72.44±1.81 | 74.20 | +| GLN-base (FFN-Sandwich_LN) | 71.00±0.74 | 72.33±1.70 | 76.75±0.05 | 73.72±2.40 | 73.36 | + +我们在初步实验中通过对随机的50G中英文混合语料库进行GLM-base(110M)的预训练来测试它们。我们发现,虽然GLU和GAU可以比原始FFN实现更好,但GLU在训练中可以更好、更稳定。 + +因此,在GLM-130B的实现中,我们选择带有GeLU激活的GLU,即GeGLU。GeGLU需要三个投影矩阵;为了保持相同数量的参数,与只利用两个矩阵的FFN相比,我们将其隐藏状态减少到2/3。 + +#### 总结 + +基于以上所有设计,GLM-130B的参数配置为: + +| 层数 | 隐层维度 | GeGLU 隐层维度 | 注意力头数量 | 最大序列长度 | 词表大小 | +|--------|--------------|--------------------|-----------------|---------------------|-------------| +| 70 | 12,288 | 32,768 | 96 | 2,048 | 150,000 | + +该词表和分词器是基于[icetk](https://github.com/THUDM/icetk)实现的。icetk是一个统一的图像、中文和英文的多模态标记器。 + +### 训练 +训练大规模语言模型的最关键挑战是**训练的稳定性**,无一例外。GLM-130B的预训练持续了60天,使用96个DGX-A100(40G)节点,等价花费490万美元的云服务费用;如果训练在半路上失败,并无法恢复训练,那将是一个巨大的损失。 + +| | +|:--:| +| *所有模型都面临训练不稳定,它可能发生在预训练的开始、中间或结束阶段(图(a)和(b)分别取自OPT和BLOOM)* | + +不幸的是,据我们观察,大模型比我们认为的那些小模型更容易受到不可避免的噪音数据和意外涌现的梯度影响。原因是,在训练效率和稳定性之间存在着权衡: + +* **效率**:我们需要一个低精度的浮点格式(如FP16),以减少内存和计算成本; +* **稳定性**:低精度浮点格式容易出现溢出和下溢。 + +而为了平衡这两个要素,我们以及最近的开放性大型模型(如[OPT-175B](https://arxiv.org/abs/2205.01068)、[BLOOM](https://huggingface.co/bigscience/bloom))都付出了巨大的努力来寻找解决方案。在此,我们提出我们的答案。 + +#### 1\. 浮点数格式:FP16 混合精度 + +FP16混合精度已经成为主流大规模模型训练框架的默认选项,用于训练十亿到百亿规模的模型。但其仍太容易遇到精度问题。作为补救措施,NVIDIA Ampere GPU提供了BF16浮点格式(被[BLOOM](https://huggingface.co/bigscience/bloom)采用)来缓解这个问题。然而,BF16在其他平台上不被支持,这大大缩小了它在更广泛的应用中的潜力。 + +为了让更多开发者使用,GLM-130B仍然选择FP16作为其训练浮点格式。同时,这意味着GLM-130B将面临着更多的稳定性挑战。幸运的是,经过多次尝试,我们发现以下的训练策略最终有助于稳定GLM-130B的训练。 + +#### 2\. 嵌入层:梯度缩减 + +我们观察到,在训练的早期阶段,嵌入层的梯度范数明显比其他层大。根据经验,我们发现大多数训练崩溃都发生在其梯度范数激增之后。为了解决这个问题,[BLOOM](https://huggingface.co/bigscience/bloom)汇报了使用[嵌入归一化](https://openreview.net/pdf?id=rI7BL3fHIZq)(我们也发现它能稳定训练),但同时,其牺牲了相对较大的下游性能。 + +由于根本问题是输入嵌入层的急剧梯度,我们建议缩小输入嵌入层的梯度。实现起来相当简单。 + +```python +word_embedding = word_embedding * α + word_embedding.detach() * (1 - α) +``` + +这就把梯度缩小到`α`。在我们的实践中,我们发现`α=0.1`对GLM-130B是最好的。 + +| ![EmbeddingShrink.png](resources/03DF31017FE184DB45D41DFFC6F80EF0.png) | +|:--:| +| *(a) 嵌入层的梯度范数在早期阶段比其他部分大得多
(b) 嵌入梯度缩减的初步实验 (alpha=0.1)* | + +在我们的初步实验中,我们观察到,对于早期阶段的训练来说,缩小嵌入梯度并没有减缓收敛速度;相反,没有缩小梯度的模型会出现意外的尖峰,并在5k步左右出现训练崩溃的情况。 + +#### 3\. 注意力计算:FP32 Softmax + +梯度收缩是一种避免训练崩溃的事后技术。从本质上讲,崩溃是由异常的损失 "梯度"形成的,要么是由于噪声数据,要么是正向计算中的精度上溢或者下溢。 + +| ![scale.png](resources/7CB441707D1035B2890AA2164C5B6EAC.png) | +|:--:| +| *每个注意力头计算出的注意力得分有非常不同的数值范围(摘自[CogView](https://proceedings.neurips.cc/paper/2021/file/a4d92e2cd541fca87e4620aba658316d-Paper.pdf))* | + +我们观察到,在大型语言模型中,注意力的计算操作是最容易上溢或下溢的。[CogView](https://proceedings.neurips.cc/paper/2021/file/a4d92e2cd541fca87e4620aba658316d-Paper.pdf)显示,不同的注意力头对其注意力分数有非常不同的数值范围,有些注意力头计算出的平均分数可以达到+1e4或-1e-3。这种不同的数值范围会导致在softmax计算中FP16下的频繁上溢或下溢。CogView提出了精度瓶颈放松(PB-Relax)来缓解这个问题,它在做softmax之前扣除了每个头的注意力得分矩阵中的最大绝对值。 + +然而,事实证明,PB-Relax在GLM-130B的训练中很慢,可能是因为在96个大小为2048*2048的注意分数矩阵中寻找最大值和操作标量对CUDA内核不友好。最后,经过几周的艰苦探索,我们发现避免这一问题的最快和最简单的方法是在softmax计算中使用FP32。与完全的FP16计算相比,它几乎没有任何速度上的损失,但明显提高了训练的稳定性。 + + + +### 预训练数据 + +#### 自监督预训练 + +我们在2.5T网络爬取的语料上,对GLM-130B进行了预训练,包括英文1.2T来自Pile的语料和1.3T中文语料. + +#### 多任务指令预训练(Multi-Task Instruction Pre-Training,MIP) + +同时,[FLAN](https://arxiv.org/pdf/2109.01652.pdf)和[T0](https://arxiv.org/pdf/2110.08207.pdf)的最新进展表明,大规模语言模型的多提示多任务指令微调可以促进更好的零样本学习能力。此外,正如[T5](https://www.jmlr.org/papers/volume21/20-074/20-074.pdf?ref=https://githubhelp.com)和[ExT5](https://arxiv.org/pdf/2111.10952.pdf)所指出的,将多任务的下游数据合并到预训练中,甚至比多任务微调更有帮助。 + +因此,在GLM-130B的预训练中,我们包括了许多从自然语言理解到生成的提示数据集,作为自监督预训练的补充。我们设定95%的标记来自自监督的预训练语料,5%的训练标记来自MIP数据集。这些数据集是从[T0](https://arxiv.org/pdf/2110.08207.pdf)和[DeepStruct](https://arxiv.org/pdf/2205.10475.pdf)中收集和转换的。按照T0的做法,每个多提示数据集中的样本都应被截断到最大数量(一般来说,T0数据集为100k,DeepStruct数据集为200k)。 + +不幸的是,由于数据准备中的一个错误,在前20k个预训练步骤中,我们意外地包括了T0++的所有数据集(其中包括最初用于评估T0中零样本任务泛化的任务)、没有调成权重进行截断、并排除了所有DeepStruct数据集。虽然我们把这个问题在20000步时进行了修正,但GLM-130B似乎对训练样本的记忆非常好,直到50000步也没有出现大量遗忘的现象,因此我们在此提醒所有用户***切勿在这个[列表](resources/multitask_list.txt)的数据集上评估GLM-130B在零样本或少样本学习的性能。 + +## GLM-130B表现如何? + +众所周知,像[GPT-3](https://arxiv.org/pdf/2005.14165.pdf)这样的大规模语言模型是优秀的少样本和零样本学习器。与GPT-3和OPT-175B的零样本学习相比,GLM-130B有一些架构上的劣势。首先,它是一个双语语言模型,不能像GPT-3(350B tokens)和OPT-175B(350B tokens)那样看到很多英语标记(GLM-130B大概见到了200B 英文tokens)。第二,GLM-130B的参数比GPT-3(175B)和OPT-175B少。 + +尽管有这些缺点,GLM-130B仍有上述的许多技术改进,这可能会弥补其在零点学习性能方面的差距。 + +* **双向注意力**。GLM-130B是一个类似于BERT的双向模型,而现有的大型语言模型主要是GPT(单向的)。双向模型在语言理解和条件生成方面远远优于GPT。 +* **改进的架构设计**。GLM-130B采用了新的架构设计,包括GeGLU、RoPE和DeepNorm。这些技术已被证明可以提高语言模型的性能。 +* **多任务指令预训练**。正如[FLAN](https://arxiv.org/pdf/2109.01652.pdf)和[T0](https://arxiv.org/pdf/2110.08207.pdf)所指出的,多任务指令预训练有助于提高零样本学习性能。 + +从目前的中间结果来看,GLM-130B在中文与英文中都是一个强大的零样本学习器。具体来说,它的表现是 + +* 在英语中与GPT-3 175B相当。 +* 在英语中优于BLOOM-176B和OPT-175B。 +* 在中文方面比ERNIE 3.0 Titan(260B)更好。 + +```diff +- 请注意,本节中的所有结果目前都是中间结果,不代表最终性能。 +``` + +### 讨论:GLM-130B的零样本学习设置 + +由于GLM-130B利用了多任务指令预训练(MIP),我们认为有必要澄清我们对零样本学习的设定。该问题似乎没有官方认可的定义,而社区中也存在许多不同的解释。我们参考了影响力较大的零样本学习[综述](https://ieeexplore.ieee.org/abstract/document/8413121)中的定义,其指出。 + +``` +At test time, in zero-shot learning setting, the aim is to assign a test image to an unseen class label, and in generalized zero-shot learning setting, the test image can be assigned either to seen or unseen classes. 
+``` + +其中,被评估的任务是否涉及未见过的类标签是一个关键。考虑到NLP的实际情况,我们为GLM-130B零样本学习评估挑选数据集的原则如下。 + +* 英文 + + 对于有固定标签的任务(如自然语言推理):同一任务中的任何数据集都不应该被评估。 + + 对于没有固定标签的任务(例如,问题回答,主题分类):只应考虑:1)相比MIP中数据集具有明显的领域转移,且 2)与MIP中的标签不同的数据集 +* 中文:所有的数据集都可以被评估 + +我们欢迎更多关于这个话题的讨论,以促进整个社区对零样本学习的研究。 + +### 零样本学习:英文 + +我们在各种不同的下游任务中测试GLM-130B。请注意,我们仍在经历评估阶段;这些结果不是最终结果,而是**中间结果**。 + +#### 语言建模(LAMBADA) +语言建模测试的是语言模型在给定其前缀语境下预测下一个单词的内在能力。我们以[LAMBADA](https://aclanthology.org/P16-1144/)为例,它是一项具有挑战性的零样本末位单词预测任务,在评估现有大规模语言模型时被广泛采用。 + +我们绘制了GLM-130B的零样本LAMBADA(En)性能,以及GPT-3 175B、OPT 175B和BLOOM 176B(OPT和BLOOM的中间结果取自[BLOOM的评估库](https://github.com/bigscience-workshop/evaluation-results/tree/676f6a8cf27d4df30b073fb490deb9e359da64aa))。与其他三个使用上下文自回归的GPT式模型相比,我们提出了GLM-130B的两个版本。 + +* **GLM-130B (bi)**对前缀上下文有双向的关注。 +* **GLM-130B (uni)**遵循传统的GPT风格,对前缀语境进行自回归注意力。 + +如图所示,双向注意力可以用较少的模型参数达到更好的性能。 + +|

| +|:--:| +| *与其他大规模语言模型相比,GLM-130B的零样本 LAMBADA(En)性能* | + +#### MMLU(大规模多任务语言理解) + +[MMLU](https://arxiv.org/pdf/2009.03300.pdf) 是一个多样化的基准数据集,包括57个关于人类知识的多选题回答任务,范围从高中水平到专家水平。它可以作为大规模语言模型少样本学习性能的理想测试平台。 + +我们绘制了GLM-130B在其训练过程上的少样本学习(5-shot)性能。GLM-130B在学习了大约3000亿个tokens后,接近GPT-3的可比性能43.9。随着训练的进行,它的能力继续增长,在学习了4000亿个tokens后达到了44.8。当我们的训练终止时,它似乎并没有饱和,这与[Chinchilla](https://arxiv.org/pdf/2203.15556.pdf)中的观察相一致,即现有的大规模语言模型仍然远远没有得到充分的训练。 + +|

|
+|:--:|
+| *与其他大规模语言模型相比,GLM-130B的少样本学习(5-shot)MMLU性能* |
+
+### 零样本学习:中文
+
+由于GLM-130B是一个双语语言模型,我们也评估了它在既有的中文NLP基准上的零样本性能:[CLUE](https://arxiv.org/pdf/2004.05986.pdf) 和 [FewCLUE](https://arxiv.org/pdf/2107.07498.pdf)。请注意,我们在多任务指令预训练(MIP)中不包括任何中文下游数据集。由于仍在评估阶段,我们目前仅评估了7个CLUE数据集和5个FewCLUE数据集。更多数据集上的结果会在之后公布。
+
+我们将GLM-130B与现有最大的中文单语语言模型ERNIE Titan 3.0进行比较,后者有260B的参数。如图所示,GLM-130B的表现优于ERNIE Titan 3.0,尤其是在生成式阅读理解数据集DRCD和CMRC2018上。
+
+| |
+|:--:|
+| *部分CLUE和FewCLUE基准数据集的零样本性能。跟随ERNIE Titan 3.0的做法,我们报告了开发集上的结果。除DRCD和CMRC2018报告EM外,其他数据集报告Acc。* |
+致谢 + +这一项目由国家自然科学基金国家杰出青年科学基金项目(No. 61825602)支持。 + +### 学生负责人 +[曾奥涵(清华大学计算机系知识工程实验室)](https://github.com/Sengxian),[刘潇(清华大学计算机系知识工程实验室)](https://github.com/xiao9905) + +### 技术贡献 +#### 清华大学计算机系知识工程实验室——the Knowledge Engineering Group at Tsinghua +杜政晓,丁铭,郑勤锴,赖瀚宇,汪子涵,杨卓毅,于济凡,张笑涵,郑问迪,夏箫,徐逸凡,谭咏霖,东昱晓,唐杰 + +#### 清华大学计算机系PACMAN实验室——the Parallel Architecture & Compiler technology of Mobile, Accelerated, and Networked systems Group at Tsinghua +马子轩,何家傲,孙桢波,翟季冬,陈文光 + +#### 清华大学计算机系自然语言处理实验室(BMInf)——the Natural Language Processing Group at Tsinghua +曾国洋,韩旭,赵威霖,刘知远 + +#### 智谱AI——an AI startup that aims to teach machines to think like humans +薛宇飞,王山,陕杰才,姜皓瀚,郭振钢,张鹏 + +### 计算资源赞助 +智谱AI + +### 项目总负责 +[唐杰(清华大学计算机系知识工程实验室 & 北京智源人工智能研究院)](http://keg.cs.tsinghua.edu.cn/jietang/) + +
+ diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/SwissArmyTransformer/LICENSE b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/SwissArmyTransformer/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..94a1cd6d0046a9aae3b22a035d4a3965ad40f3b9 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/SwissArmyTransformer/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. 
We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2021 Ming Ding + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/SwissArmyTransformer/README.md b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/SwissArmyTransformer/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7719d4765ba83d4f8eac933a0aa4b464daaf7fc4 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/SwissArmyTransformer/README.md @@ -0,0 +1,5 @@ +# README + +The GLM-130B model is implemented on top of the [SwissArmyTransformer](https://github.com/THUDM/SwissArmyTransformer) library (SAT for short), which also contains implementations of many other models. + +Adapting GLM-130B to NPU requires some modifications to the SAT library, and those adaptation changes are collected in this folder. Please download the SAT library, copy the files under this folder over the SAT root directory, and then follow its documentation to run setup so that the modifications take effect. diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/SwissArmyTransformer/SwissArmyTransformer/__init__.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/SwissArmyTransformer/SwissArmyTransformer/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..1af20b9155a16f4647b150da958d1236cae49c50 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/SwissArmyTransformer/SwissArmyTransformer/__init__.py @@ -0,0 +1,6 @@ +from .arguments import get_args, update_args_with_file +from .training.deepspeed_training import training_main +from .tokenization import get_tokenizer +from .model import AutoModel + +from torch_npu.contrib import transfer_to_npu \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/SwissArmyTransformer/SwissArmyTransformer/model/official/glm130B_model.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/SwissArmyTransformer/SwissArmyTransformer/model/official/glm130B_model.py new file mode 100644 index 0000000000000000000000000000000000000000..baada01837dff8a6086ca727ee95c9160224207c --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/SwissArmyTransformer/SwissArmyTransformer/model/official/glm130B_model.py @@ -0,0 +1,378 @@ +import math +import torch +import enum +from torch.nn import functional as F + +from SwissArmyTransformer import mpu +from SwissArmyTransformer.transformer_defaults import standard_attention +from SwissArmyTransformer.mpu.utils import split_tensor_along_last_dim, divide +from SwissArmyTransformer.mpu.layers import ColumnParallelLinear +from SwissArmyTransformer.model.base_model import BaseModel, BaseMixin +from SwissArmyTransformer.model.position_embedding import RotaryEmbedding +from SwissArmyTransformer.model.position_embedding import apply_rotary_pos_emb_index + +# flags required to enable jit fusion kernels +torch._C._jit_set_profiling_mode(False) +torch._C._jit_set_profiling_executor(False) +torch._C._jit_override_can_fuse_on_cpu(True) +torch._C._jit_override_can_fuse_on_gpu(True) + +# try: +# from apex.transformer.functional import FusedScaleMaskSoftmax +# from apex.transformer.enums import AttnMaskType +# except
ModuleNotFoundError: +# print( +# "Please install apex to use FusedScaleMaskSoftmax, otherwise the inference efficiency will be greatly reduced" +# ) +# FusedScaleMaskSoftmax = None + +class AttnMaskType(enum.Enum): + padding = 1 + causal = 2 + +FusedScaleMaskSoftmax = None + + +class RotaryEmbeddingMixin(BaseMixin): + def __init__( + self, + fp16: bool, + hidden_size: int, + num_attention_heads: int, + model_parallel_size: int, + position_encoding_2d: bool + ): + super().__init__() + hidden_size_per_attention_head = divide(hidden_size, num_attention_heads) + self.hidden_size_per_attention_head = hidden_size_per_attention_head + self.num_attention_heads_per_partition = divide(num_attention_heads, model_parallel_size) + self.position_encoding_2d = position_encoding_2d + self.rotary_emb = RotaryEmbedding( + hidden_size_per_attention_head // 2 + if position_encoding_2d + else hidden_size_per_attention_head, + base=10000, + precision=torch.half if fp16 else torch.float, + learnable=False, + device=torch.cuda.current_device(), + ) + + def attention_forward(self, hidden_states, mask, **kw_args): + attn = self.transformer.layers[kw_args["layer_id"]].attention + attention_fn = standard_attention + if "attention_fn" in attn.hooks: + attention_fn = attn.hooks["attention_fn"] + + # [seq, b, 3 * hn * np] + mixed_raw_layer = attn.query_key_value(hidden_states) + + # [seq, b, (np * 3 * hn)] --> [seq, b, np, 3 * hn] + new_tensor_shape = mixed_raw_layer.size()[:-1] + ( + self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head, + ) + mixed_raw_layer = mixed_raw_layer.view(*new_tensor_shape) + + # [sq, b, np, hn] + (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_raw_layer, 3) + + dropout_fn = attn.attention_dropout if attn.training else None + + position_ids = kw_args["position_ids"] + if self.position_encoding_2d: + q1, q2 = query_layer.chunk(2, dim=(query_layer.ndim - 1)) + k1, k2 = key_layer.chunk(2, dim=(key_layer.ndim - 1)) + cos, sin = self.rotary_emb(q1, seq_len=position_ids.max() + 1) + position_ids, block_position_ids = position_ids[:, 0, :].transpose(0, 1).contiguous(), \ + position_ids[:, 1, :].transpose(0, 1).contiguous() + q1, k1 = apply_rotary_pos_emb_index(q1, k1, cos, sin, position_ids) + q2, k2 = apply_rotary_pos_emb_index(q2, k2, cos, sin, block_position_ids) + query_layer = torch.concat([q1, q2], dim=(q1.ndim - 1)) + key_layer = torch.concat([k1, k2], dim=(k1.ndim - 1)) + else: + position_ids = position_ids.transpose(0, 1) + cos, sin = self.rotary_emb(value_layer, seq_len=position_ids.max() + 1) + query_layer, key_layer = apply_rotary_pos_emb_index(query_layer, key_layer, cos, sin, position_ids) + + context_layer = attention_fn(query_layer, key_layer, value_layer, mask, dropout_fn, **kw_args) + + output = attn.dense(context_layer) + + if attn.training: + output = attn.output_dropout(output) + + return output + + +class GEGLU(torch.nn.Module): + def __init__(self): + super().__init__() + self.activation_fn = F.gelu + + def forward(self, x): + # dim=-1 breaks in jit for pt<1.10 + x1, x2 = x.chunk(2, dim=(x.ndim - 1)) + return x1 * self.activation_fn(x2) + + +class DeepNormWithGLUMixin(BaseMixin): + def __init__(self, num_layers, hidden_size, inner_hidden_size=None): + super().__init__() + self.num_layers = num_layers + self.hidden_size = hidden_size + if inner_hidden_size is None: + inner_hidden_size = 4 * hidden_size * 2 // 3 + self.inner_hidden_size = inner_hidden_size + + def reinit(self): + for layer in self.transformer.layers: + del 
layer.mlp.dense_h_to_4h + layer.mlp.dense_h_to_4h = ColumnParallelLinear( + self.hidden_size, + 2 * self.inner_hidden_size, + gather_output=False, + bias=True, + params_dtype=torch.half, + module=self, + name="dense_h_to_4h", + skip_init=True, + ) + del layer.mlp.activation_func + layer.mlp.activation_func = GEGLU() + + def layer_forward(self, hidden_states, mask, *args, **kw_args): + """ + hidden_states: [seq_len, batch, hidden_size] + mask: [(1, 1), seq_len, seq_len] + """ + layer = self.transformer.layers[kw_args["layer_id"]] + # Layer norm at the begining of the transformer layer. + + attention_input = layer.input_layernorm(hidden_states) + + # Self attention. + attention_output = layer.attention(attention_input, mask, **kw_args) + + # Residual connection. + alpha = (2 * self.num_layers) ** 0.5 + hidden_states = attention_input * alpha + attention_output + + mlp_input = layer.post_attention_layernorm(hidden_states) + + # MLP. + mlp_output = layer.mlp(mlp_input, **kw_args) + + # Second residual connection. + output = mlp_input * alpha + mlp_output + + return output + + +class SelfAttentionWithFP32SoftmaxMixin(BaseMixin): + def __init__(self, hidden_size, num_attention_heads, model_parallel_size): + super().__init__() + self.hidden_size_per_attention_head = divide(hidden_size, num_attention_heads) + self.hidden_size_per_partition = divide(hidden_size, model_parallel_size) + self.scale_mask_softmax = None + if FusedScaleMaskSoftmax is not None: + self.scale_mask_softmax = FusedScaleMaskSoftmax( + input_in_fp16=True, + input_in_bf16=False, + attn_mask_type=AttnMaskType.padding, + scaled_masked_softmax_fusion=True, + mask_func=self.attention_mask_func, + softmax_in_fp32=True, + scale=1, + ) + + @staticmethod + def attention_mask_func(attention_scores, attention_mask): + attention_scores.masked_fill_(attention_mask, -10000.0) + return attention_scores + + def attention_fn( + self, + query_layer, + key_layer, + value_layer, + attention_mask, + attention_dropout=None, + log_attention_weights=None, + scaling_attention_score=True, + mems=None, + **kwargs + ): + + mem = mems[kwargs["layer_id"]] if mems is not None else None + + # seqlen, batch, head, hidden_size + seq_len, b, nh, hidden_size = key_layer.shape + + # b, seqlen, stack, head, hidden + cache_kv = ( + torch.stack((key_layer, value_layer)) + .permute(2, 1, 0, 3, 4) + .detach() + .contiguous() + .view(b, seq_len, nh * hidden_size * 2) + ) + kwargs["output_this_layer"]["mem_kv"] = cache_kv + + if mem is not None: # the first time, mem is None + # might change batch_size + # b, seqlen, stack, head, hidden -> stack, seqlen, b, head, hidden + mem = mem.expand(b, -1, -1).reshape(b, mem.shape[1], 2, nh, hidden_size).permute(2, 1, 0, 3, 4) + memk, memv = mem[0], mem[1] + key_layer = torch.cat((memk, key_layer), dim=0) + value_layer = torch.cat((memv, value_layer), dim=0) + + query_key_layer_scaling_coeff = float(kwargs["layer_id"] + 1) + if scaling_attention_score: + query_layer = query_layer / (math.sqrt(self.hidden_size_per_attention_head) * query_key_layer_scaling_coeff) + + # =================================== + # Raw attention scores. 
[b, np, s, s] + # =================================== + + # [b, np, sq, sk] + output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0)) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) + + matmul_result = torch.empty( + output_size[0] * output_size[1], + output_size[2], + output_size[3], + dtype=query_layer.dtype, + device=torch.cuda.current_device(), + ) + + matmul_result = torch.baddbmm( + matmul_result, + query_layer.transpose(0, 1), # [b * np, sq, hn] + key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] + beta=0.0, + alpha=1.0, + ) + + # change view to [b, np, sq, sk] + attention_scores = matmul_result.view(*output_size) + + # if log_attention_weights is not None: + # attention_scores += log_attention_weights + + if self.scale_mask_softmax: + self.scale_mask_softmax.scale = query_key_layer_scaling_coeff + attention_probs = self.scale_mask_softmax(attention_scores, attention_mask.contiguous()) + else: + if not (attention_mask.shape[-2] == 1 and (attention_mask > 0).all()): + # if auto-regressive, skip + attention_scores.masked_fill_(attention_mask, -10000.0) + + attention_scores = attention_scores.float() + attention_scores = attention_scores * query_key_layer_scaling_coeff + + attention_probs = F.softmax(attention_scores, dim=-1) + + attention_probs = attention_probs.half() + + if attention_dropout is not None: + if mpu.get_cuda_rng_tracker is not None: + with mpu.get_cuda_rng_tracker().fork(): + attention_probs = attention_dropout(attention_probs) + else: + attention_probs = attention_dropout(attention_probs) + + # ========================= + # Context layer. [sq, b, hp] + # ========================= + + # value_layer -> context layer. 
+ # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) + + # change view [sk, b * np, hn] + value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) + + # change view [b * np, sq, sk] + attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) + + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) + + # change view [b, np, sq, hn] + context_layer = context_layer.view(*output_size) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +class FinalForwardMixin(BaseMixin): + def __init__(self): + super().__init__() + + def final_forward(self, logits, **kw_args): + return F.linear(logits, self.transformer.word_embeddings.weight).transpose(0, 1).contiguous() + + +class NonePositionEmbedding(BaseMixin): + def __init__(self): + super().__init__() + + def position_embedding_forward(self, position_ids, output_cross_layer, **kw_args): + return None + + +class WordEmbedding(BaseMixin): + def __init__(self): + super().__init__() + + def word_embedding_forward(self, input_ids, output_cross_layer, **kw_args): + return self.transformer.word_embeddings(input_ids).transpose(0, 1) + + +class GLM130B(BaseModel): + def __init__(self, args, transformer=None, parallel_output=False): + super().__init__( + args, + params_dtype=torch.half if args.fp16 else torch.float, + transformer=transformer, + parallel_output=parallel_output, + ) + self.add_mixin("glu-deepnorm", DeepNormWithGLUMixin(args.num_layers, args.hidden_size, args.inner_hidden_size)) + self.add_mixin( + "fp32-softmax", + SelfAttentionWithFP32SoftmaxMixin(args.hidden_size, args.num_attention_heads, args.model_parallel_size), + ) + self.add_mixin("final-forward", FinalForwardMixin()) + self.add_mixin("non-position-embedding", NonePositionEmbedding()) + del self.transformer.position_embeddings + self.add_mixin("word-embedding", WordEmbedding()) + self.add_mixin( + "rotary-embedding", + RotaryEmbeddingMixin( + args.fp16, + args.hidden_size, + args.num_attention_heads, + args.model_parallel_size, + args.position_encoding_2d + ), + ) + if not args.no_glu: + self.get_mixin("glu-deepnorm").reinit() + + @classmethod + def add_model_specific_args(cls, parser): + parser.add_argument('--position-encoding-2d', action='store_true', help='Use 2D rotary embedding.') + parser.add_argument('--no-glu', action='store_true', help='Disable GLU.') diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/SwissArmyTransformer/SwissArmyTransformer/model/transformer.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/SwissArmyTransformer/SwissArmyTransformer/model/transformer.py new file mode 100755 index 0000000000000000000000000000000000000000..d65322fc77da639ec13971cbab830610254d57d3 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/SwissArmyTransformer/SwissArmyTransformer/model/transformer.py @@ -0,0 +1,590 @@ +# coding=utf-8 +# rewritten, Copyright (c) 2021, Ming Ding. All rights reserved. +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Transformer.""" + +import math +import copy +import torch +import torch.nn.functional as F + +from SwissArmyTransformer import mpu +from SwissArmyTransformer.mpu.initialize import get_model_parallel_world_size +from SwissArmyTransformer.mpu.layers import ColumnParallelLinear, RowParallelLinear, VocabParallelEmbedding +from SwissArmyTransformer.mpu.mappings import gather_from_model_parallel_region, copy_to_model_parallel_region + +from deepspeed.runtime.activation_checkpointing.checkpointing import checkpoint + +from SwissArmyTransformer.mpu.utils import divide, sqrt, scaled_init_method, unscaled_init_method, gelu +from SwissArmyTransformer.mpu.utils import split_tensor_along_last_dim +from SwissArmyTransformer.ops import LayerNorm + +from SwissArmyTransformer.transformer_defaults import HOOKS_DEFAULT, standard_attention + + +class SelfAttention(torch.nn.Module): + def __init__(self, hidden_size, num_attention_heads, + attention_dropout_prob, output_dropout_prob, + init_method, layer_id, hidden_size_per_attention_head=None, output_layer_init_method=None, bias=True, + hooks={}, transformer_pointer=None, params_dtype=torch.float, skip_init=False, device=torch.device('cpu')): + super(SelfAttention, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + self.hooks = hooks + self.layer_id = layer_id + # Per attention head and per partition values. + world_size = get_model_parallel_world_size() + self.hidden_size = hidden_size + if hidden_size_per_attention_head is None: + self.hidden_size_per_attention_head = divide(hidden_size, num_attention_heads) + else: + self.hidden_size_per_attention_head = hidden_size_per_attention_head + self.num_attention_heads_per_partition = divide(num_attention_heads, world_size) + self.inner_hidden_size = num_attention_heads * self.hidden_size_per_attention_head + self.hidden_size_per_partition = self.hidden_size_per_attention_head * self.num_attention_heads_per_partition + + # Strided linear layer. + self.query_key_value = ColumnParallelLinear( + hidden_size, + 3 * self.inner_hidden_size, + stride=3, + gather_output=False, + init_method=init_method, + bias=bias, + params_dtype=params_dtype, + module=self, + name="query_key_value", + skip_init=skip_init, + device=device + ) + self.attention_dropout = torch.nn.Dropout(attention_dropout_prob) + + self.dense = RowParallelLinear( + self.inner_hidden_size, + hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method, + bias=bias, + params_dtype=params_dtype, + module=self, + name="dense", + skip_init=skip_init, + device=device + ) + self.output_dropout = torch.nn.Dropout(output_dropout_prob) + + object.__setattr__(self, 'transformer', transformer_pointer) + assert transformer_pointer is not None + + def _transpose_for_scores(self, tensor): + """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with + size [b, np, s, hn]. 
+ """ + new_tensor_shape = tensor.size()[:-1] + \ + (self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + tensor = tensor.view(*new_tensor_shape) + return tensor.permute(0, 2, 1, 3) + + def forward(self, hidden_states, mask, *args, **kw_args): + if 'attention_forward' in self.hooks: + return self.hooks['attention_forward'](hidden_states, mask, **kw_args) + else: + return HOOKS_DEFAULT['attention_forward'](self, hidden_states, mask, **kw_args) + + +class CrossAttention(torch.nn.Module): + """Parallel cross-attention layer for Transformer""" + + def __init__(self, hidden_size, num_attention_heads, attention_dropout_prob, output_dropout_prob, init_method, + layer_id, hidden_size_per_attention_head=None, output_layer_init_method=None, bias=True, hooks={}, + transformer_pointer=None, params_dtype=torch.float, skip_init=False, device=torch.device('cpu')): + super().__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + self.hooks = hooks + self.layer_id = layer_id + # Per attention head and per partition values. + world_size = get_model_parallel_world_size() + self.hidden_size = hidden_size + if hidden_size_per_attention_head is None: + self.hidden_size_per_attention_head = divide(hidden_size, num_attention_heads) + else: + self.hidden_size_per_attention_head = hidden_size_per_attention_head + self.num_attention_heads_per_partition = divide(num_attention_heads, world_size) + self.inner_hidden_size = num_attention_heads * self.hidden_size_per_attention_head + self.hidden_size_per_partition = self.hidden_size_per_attention_head * self.num_attention_heads_per_partition + # Strided linear layer. + self.query = ColumnParallelLinear(hidden_size, self.inner_hidden_size, + gather_output=False, + init_method=init_method, bias=bias, params_dtype=params_dtype, module=self, name="query", skip_init=skip_init, device=device) + self.key_value = ColumnParallelLinear(hidden_size, 2 * self.inner_hidden_size, + stride=2, + gather_output=False, + init_method=init_method, bias=bias, params_dtype=params_dtype, module=self, name="key_value", + skip_init=skip_init, device=device) + # Dropout. Note that for a single iteration, this layer will generate + # different outputs on different number of parallel partitions but + # on average it should not be partition dependent. + self.attention_dropout = torch.nn.Dropout(attention_dropout_prob) + + # Output. + self.dense = RowParallelLinear( + self.inner_hidden_size, + hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method, bias=bias, params_dtype=params_dtype, module=self, name="dense",skip_init=skip_init, + device=device) + self.output_dropout = torch.nn.Dropout(output_dropout_prob) + + object.__setattr__(self, 'transformer', transformer_pointer) + assert transformer_pointer is not None + + def _transpose_for_scores(self, tensor): + """Transpose a 3D tensor [b, s, np*hn] into a 4D tensor with + size [b, np, s, hn]. 
+ """ + new_tensor_shape = tensor.size()[:-1] + \ + (self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + tensor = tensor.view(*new_tensor_shape) + return tensor.permute(0, 2, 1, 3) + + def forward(self, hidden_states, cross_attention_mask, encoder_outputs, **kw_args): + # hidden_states: [b, s, h] + if 'cross_attention_forward' in self.hooks: + return self.hooks['cross_attention_forward'](hidden_states, cross_attention_mask, encoder_outputs, **kw_args) + else: + return HOOKS_DEFAULT['cross_attention_forward'](self, hidden_states, cross_attention_mask, encoder_outputs, **kw_args) + + +class MLP(torch.nn.Module): + def __init__(self, hidden_size, output_dropout_prob, init_method, inner_hidden_size=None, + output_layer_init_method=None, layer_id=None, hooks={}, bias=True, activation_func=gelu, transformer_pointer=None, params_dtype=torch.float, skip_init=False, device=torch.device('cpu')): + super(MLP, self).__init__() + self.layer_id = layer_id + self.activation_func = activation_func + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + self.hooks = hooks + # Project to 4h. + self.hidden_size = hidden_size + if inner_hidden_size is None: + inner_hidden_size = 4 * hidden_size + self.inner_hidden_size = inner_hidden_size + self.dense_h_to_4h = ColumnParallelLinear( + self.hidden_size, + self.inner_hidden_size, + gather_output=False, + init_method=init_method, + bias=bias, + params_dtype=params_dtype, + module=self, + name="dense_h_to_4h", + skip_init=skip_init, + device=device + ) + # Project back to h. + self.dense_4h_to_h = RowParallelLinear( + self.inner_hidden_size, + self.hidden_size, + input_is_parallel=True, + init_method=output_layer_init_method, + bias=bias, + params_dtype=params_dtype, + module=self, + name="dense_4h_to_h", + skip_init=skip_init, + device=device + ) + self.dropout = torch.nn.Dropout(output_dropout_prob) + object.__setattr__(self, 'transformer', transformer_pointer) + assert transformer_pointer is not None + + + def forward(self, hidden_states, **kw_args): + if 'mlp_forward' in self.hooks: + output = self.hooks['mlp_forward'](hidden_states, **kw_args) + else: + output = HOOKS_DEFAULT['mlp_forward'](self, hidden_states, **kw_args) + + if self.training: + output = self.dropout(output) + return output + + +class BaseTransformerLayer(torch.nn.Module): + def __init__( + self, + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + init_method, + layer_id, + inner_hidden_size=None, + hidden_size_per_attention_head=None, + output_layer_init_method=None, + layernorm_order='pre', + layernorm=LayerNorm, + is_decoder=False, + use_bias=True, + activation_func=gelu, + hooks={}, + transformer_pointer=None, + params_dtype=torch.float, + skip_init=False, + device=torch.device('cpu') + ): + super(BaseTransformerLayer, self).__init__() + # Set output layer initialization if not provided. + if output_layer_init_method is None: + output_layer_init_method = init_method + self.layer_id = layer_id + self.is_decoder = is_decoder + self.layernorm_order = layernorm_order + self.hooks = hooks + object.__setattr__(self, 'transformer', transformer_pointer) + assert transformer_pointer is not None + + # Layernorm on the input data. + self.input_layernorm = layernorm(hidden_size, eps=layernorm_epsilon) + + # Self attention. 
+ self.attention = SelfAttention( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + layer_id, + hidden_size_per_attention_head=hidden_size_per_attention_head, + output_layer_init_method=output_layer_init_method, + bias=use_bias, + hooks=hooks, + transformer_pointer=transformer_pointer, + params_dtype=params_dtype, + skip_init=skip_init, + device=device + ) + + # Layernorm on the input data. + self.post_attention_layernorm = layernorm(hidden_size, eps=layernorm_epsilon) + if self.layernorm_order == 'sandwich': + self.third_layernorm = layernorm(hidden_size, eps=layernorm_epsilon) + self.fourth_layernorm = layernorm(hidden_size, eps=layernorm_epsilon) + + # Cross attention. + if self.is_decoder: + self.cross_attention = CrossAttention( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + init_method, + layer_id, + hidden_size_per_attention_head=hidden_size_per_attention_head, + output_layer_init_method=output_layer_init_method, + bias=use_bias, + hooks=hooks, + transformer_pointer=transformer_pointer, + params_dtype=params_dtype + ) + self.post_cross_attention_layernorm = layernorm(hidden_size, eps=layernorm_epsilon) + + # MLP + self.mlp = MLP( + hidden_size, + output_dropout_prob, + init_method, + inner_hidden_size=inner_hidden_size, + output_layer_init_method=output_layer_init_method, + bias=use_bias, + layer_id=layer_id, + activation_func=activation_func, + hooks=hooks, + transformer_pointer=transformer_pointer, + params_dtype=params_dtype, + skip_init=skip_init, + device=device + ) + + def forward(self, hidden_states, mask, *args, **kw_args): + return HOOKS_DEFAULT['layer_forward'](self, hidden_states, mask, *args, **kw_args) + + +class BaseTransformer(torch.nn.Module): + def __init__(self, + num_layers, + vocab_size, + hidden_size, + num_attention_heads, + max_sequence_length, + embedding_dropout_prob, + attention_dropout_prob, + output_dropout_prob, + checkpoint_activations, + checkpoint_num_layers=1, + layernorm_epsilon=1.0e-5, + init_method_std=0.02, + inner_hidden_size=None, + hidden_size_per_attention_head=None, + layernorm_order='pre', + parallel_output=True, + is_decoder=False, + use_bias=True, + activation_func=gelu, + layernorm=LayerNorm, + init_method=None, + use_final_layernorm=True, + hooks={}, + params_dtype=torch.float, + skip_init=False, + device=torch.device('cpu') + ): + super(BaseTransformer, self).__init__() + + # recording parameters + self.is_decoder = is_decoder + self.parallel_output = parallel_output + self.checkpoint_activations = checkpoint_activations + self.checkpoint_num_layers = checkpoint_num_layers + self.max_sequence_length = max_sequence_length + self.layernorm_order = layernorm_order + self.hooks = copy.copy(hooks) # hooks will be updated each forward + object.__setattr__(self, 'transformer', self) # to give the default hooks the same api as outer hooks + + # create embedding parameters + self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob) + + self.word_embeddings = VocabParallelEmbedding( + num_embeddings=vocab_size, embedding_dim=hidden_size, + params_dtype=params_dtype, skip_init=skip_init, device=device) + + self.position_embeddings = torch.nn.Embedding(max_sequence_length, hidden_size) + torch.nn.init.normal_(self.position_embeddings.weight, mean=0.0, std=init_method_std) + + # create all layers + if init_method is None: + self.output_layer_init_method = scaled_init_method(init_method_std, num_layers) + self.init_method = 
unscaled_init_method(init_method_std) + else: + self.output_layer_init_method = init_method + self.init_method = init_method + + def get_layer(layer_id): + return BaseTransformerLayer( + hidden_size, + num_attention_heads, + attention_dropout_prob, + output_dropout_prob, + layernorm_epsilon, + self.init_method, + layer_id, + inner_hidden_size=inner_hidden_size, + hidden_size_per_attention_head=hidden_size_per_attention_head, + output_layer_init_method=self.output_layer_init_method, + is_decoder=self.is_decoder, + layernorm_order=layernorm_order, + layernorm=layernorm, + use_bias=use_bias, + activation_func=activation_func, + hooks=self.hooks, + transformer_pointer=self, + params_dtype=params_dtype, + skip_init=skip_init, + device=device + ) + + self.layers = torch.nn.ModuleList( + [get_layer(layer_id) for layer_id in range(num_layers)]) + + # Final layer norm before output. + self.use_final_layernorm = use_final_layernorm + if use_final_layernorm: + self.final_layernorm = layernorm(hidden_size, eps=layernorm_epsilon) + + def forward(self, input_ids, position_ids, attention_mask, *, + output_hidden_states=False, **kw_args): + # sanity check + assert len(input_ids.shape) >= 2 + batch_size, query_length = input_ids.shape[:2] + + if attention_mask is None: + attention_mask = torch.ones(1, 1, device=input_ids.device).type_as( + next(self.parameters()) + ) # None means full attention + assert len(attention_mask.shape) == 2 or \ + len(attention_mask.shape) == 4 and attention_mask.shape[1] == 1 + + # initial output_cross_layer might be generated by word/position_embedding_forward + output_cross_layer = {} + + # embedding part + if 'word_embedding_forward' in self.hooks: + hidden_states = self.hooks['word_embedding_forward'](input_ids, output_cross_layer=output_cross_layer, **kw_args) + else: # default + hidden_states = HOOKS_DEFAULT['word_embedding_forward'](self, input_ids, output_cross_layer=output_cross_layer,**kw_args) + + if 'position_embedding_forward' in self.hooks: + position_embeddings = self.hooks['position_embedding_forward'](position_ids, output_cross_layer=output_cross_layer, **kw_args) + else: + assert len(position_ids.shape) <= 2 + assert position_ids.shape[-1] == hidden_states.shape[1], (position_ids.shape, hidden_states.shape) + position_embeddings = HOOKS_DEFAULT['position_embedding_forward'](self, position_ids, output_cross_layer=output_cross_layer, **kw_args) + if position_embeddings is not None: + hidden_states = hidden_states + position_embeddings + hidden_states = self.embedding_dropout(hidden_states) + + output_per_layers = [] + if self.checkpoint_activations: + # define custom_forward for checkpointing + def custom(start, end, kw_args_index, cross_layer_index): + def custom_forward(*inputs): + layers_ = self.layers[start:end] + x_, mask = inputs[0], inputs[1] + + # recover kw_args and output_cross_layer + flat_inputs = inputs[2:] + kw_args, output_cross_layer = {}, {} + for k, idx in kw_args_index.items(): + kw_args[k] = flat_inputs[idx] + for k, idx in cross_layer_index.items(): + output_cross_layer[k] = flat_inputs[idx] + # ----------------- + + output_per_layers_part = [] + for i, layer in enumerate(layers_): + output_this_layer_obj, output_cross_layer_obj = {}, {} + if 'layer_forward' in self.hooks: + layer_ret = self.hooks['layer_forward']( + x_, mask, layer_id=layer.layer_id, + **kw_args, **output_cross_layer, + output_this_layer=output_this_layer_obj, + output_cross_layer=output_cross_layer_obj + ) + else: + layer_ret = layer( + x_, mask, layer_id=layer.layer_id, + 
**kw_args, **output_cross_layer, + output_this_layer=output_this_layer_obj, + output_cross_layer=output_cross_layer_obj + ) + if isinstance(layer_ret, tuple): + layer_ret = layer_ret[0] # for legacy API + x_, output_this_layer, output_cross_layer = layer_ret, output_this_layer_obj, output_cross_layer_obj + if output_hidden_states: + output_this_layer['hidden_states'] = x_ + output_per_layers_part.append(output_this_layer) + + # flatten for re-aggregate keywords outputs + flat_outputs = [] + for output_this_layer in output_per_layers_part: + for k in output_this_layer: + # TODO add warning for depth>=2 grad tensors + flat_outputs.append(output_this_layer[k]) + output_this_layer[k] = len(flat_outputs) - 1 + for k in output_cross_layer: + flat_outputs.append(output_cross_layer[k]) + output_cross_layer[k] = len(flat_outputs) - 1 + # -------------------- + + return (x_, output_per_layers_part, output_cross_layer, *flat_outputs) + return custom_forward + + # prevent to lose requires_grad in checkpointing. + # To save memory when only finetuning the final layers, don't use checkpointing. + if self.training: + hidden_states.requires_grad_(True) + + l, num_layers = 0, len(self.layers) + chunk_length = self.checkpoint_num_layers + output_this_layer = [] + while l < num_layers: + args = [hidden_states, attention_mask] + # flatten kw_args and output_cross_layer + flat_inputs, kw_args_index, cross_layer_index = [], {}, {} + for k, v in kw_args.items(): + flat_inputs.append(v) + kw_args_index[k] = len(flat_inputs) - 1 + for k, v in output_cross_layer.items(): + flat_inputs.append(v) + cross_layer_index[k] = len(flat_inputs) - 1 + # -------------------- + hidden_states, output_per_layers_part, output_cross_layer, *flat_outputs = \ + checkpoint(custom(l, l + chunk_length, kw_args_index, cross_layer_index), *args, *flat_inputs) + + # recover output_per_layers_part, output_cross_layer + for output_this_layer in output_per_layers_part: + for k in output_this_layer: + output_this_layer[k] = flat_outputs[output_this_layer[k]] + for k in output_cross_layer: + output_cross_layer[k] = flat_outputs[output_cross_layer[k]] + # -------------------- + + output_per_layers.extend(output_per_layers_part) + l += chunk_length + else: + output_this_layer = [] + for i, layer in enumerate(self.layers): + args = [hidden_states, attention_mask] + + output_this_layer_obj, output_cross_layer_obj = {}, {} + + if 'layer_forward' in self.hooks: # customized layer_forward + layer_ret = self.hooks['layer_forward'](*args, + layer_id=torch.tensor(i), + **kw_args, + position_ids=position_ids, + **output_cross_layer, + output_this_layer=output_this_layer_obj, output_cross_layer=output_cross_layer_obj + ) + else: + layer_ret = layer(*args, layer_id=torch.tensor(i), **kw_args, **output_cross_layer, + output_this_layer=output_this_layer_obj, output_cross_layer=output_cross_layer_obj) + if isinstance(layer_ret, tuple): + layer_ret = layer_ret[0] # for legacy API + hidden_states, output_this_layer, output_cross_layer = layer_ret, output_this_layer_obj, output_cross_layer_obj + + if output_hidden_states: + output_this_layer['hidden_states'] = hidden_states + output_per_layers.append(output_this_layer) + + # Final layer norm. 
+ if self.use_final_layernorm: + logits = self.final_layernorm(hidden_states) + else: + logits = hidden_states + + logits = copy_to_model_parallel_region(logits) + if 'final_forward' in self.hooks: + logits_parallel = self.hooks['final_forward'](logits, **kw_args) + else: + logits_parallel = HOOKS_DEFAULT['final_forward'](self, logits, **kw_args) + + if not self.parallel_output: + logits_parallel = logits_parallel.npu_format_cast(2) # only use for pytorch_npu + logits_parallel = logits_parallel.clone() # prevent NPU memory allocation error + logits_parallel = gather_from_model_parallel_region(logits_parallel) + + outputs = [logits_parallel] + outputs.extend(output_per_layers) + + return outputs diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/SwissArmyTransformer/SwissArmyTransformer/ops/layernorm.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/SwissArmyTransformer/SwissArmyTransformer/ops/layernorm.py new file mode 100755 index 0000000000000000000000000000000000000000..b1d689f469b0317c939c3328c9528cf0e7f4b2e8 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/SwissArmyTransformer/SwissArmyTransformer/ops/layernorm.py @@ -0,0 +1,15 @@ +# try: +# from apex.normalization.fused_layer_norm import FusedLayerNorm +# class LayerNorm(FusedLayerNorm): +# def __init__(self, *args, pb_relax=False, **kwargs): +# super().__init__(*args, **kwargs) +# self.pb_relax = pb_relax + +# def forward(self, x): +# if not self.pb_relax: +# return super().forward(x) +# return super().forward(x / (x.abs().max().detach() / 8)) +# except ModuleNotFoundError: +# print('Please install apex to use fused_layer_norm, fall back to torch.nn.LayerNorm') +# from torch.nn import LayerNorm +from torch.nn import LayerNorm \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/benchmark.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..c23e8069cfd9d7811937220d0a7e50aa83957caf --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/benchmark.py @@ -0,0 +1,20 @@ +import torch +import time +from initialize import initialize, initialize_model_and_tokenizer + +if __name__ == "__main__": + args = initialize(extra_args_provider=lambda parser: None) + model, tokenizer = initialize_model_and_tokenizer(args) + + for seq_len in [512, 1024, 2048]: + torch.distributed.barrier() + start = time.time() + with torch.no_grad(): + _, *_ = model( + torch.ones(1, seq_len, device=torch.cuda.current_device(), dtype=torch.int64), + torch.arange(seq_len, device=torch.cuda.current_device(), dtype=torch.int64).view(1, -1), + torch.randn(1, 1, seq_len, seq_len, device=torch.cuda.current_device()) < 0.5, + ) + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(f"Encode {seq_len}: {(time.time() - start) * 1000:.2f} ms") diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/configs/model_glm_130b.sh b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/configs/model_glm_130b.sh new file mode 100644 index 0000000000000000000000000000000000000000..e3f13db2b63f0ff59dda38bf7887d1aeb45697e4 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/configs/model_glm_130b.sh @@ -0,0 +1,15 @@ +MODEL_TYPE="glm-130b" +CHECKPOINT_PATH="" +MP_SIZE=8 +MODEL_ARGS="--model-parallel-size ${MP_SIZE} \ + --num-layers 70 \ + --hidden-size 12288 \ + --inner-hidden-size 32768 \ + --vocab-size 150528 \ + --num-attention-heads 96 \ + --max-sequence-length 2048 \ + --tokenizer-type icetk-glm-130B \ + --layernorm-order post \ + --load 
${CHECKPOINT_PATH} \ + --skip-init \ + --fp16" diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/configs/model_glm_130b_int4.sh b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/configs/model_glm_130b_int4.sh new file mode 100644 index 0000000000000000000000000000000000000000..391e1cbef7373cc54a3f5935e0d6fb1367998c26 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/configs/model_glm_130b_int4.sh @@ -0,0 +1,16 @@ +MODEL_TYPE="glm-130b" +CHECKPOINT_PATH="" +MP_SIZE=4 +MODEL_ARGS="--model-parallel-size ${MP_SIZE} \ + --num-layers 70 \ + --hidden-size 12288 \ + --inner-hidden-size 32768 \ + --vocab-size 150528 \ + --num-attention-heads 96 \ + --max-sequence-length 2048 \ + --tokenizer-type icetk-glm-130B \ + --layernorm-order post \ + --quantization-bit-width 4 \ + --load ${CHECKPOINT_PATH} \ + --skip-init \ + --fp16" diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/configs/model_glm_130b_int8.sh b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/configs/model_glm_130b_int8.sh new file mode 100644 index 0000000000000000000000000000000000000000..5eb4ec88418028af2005c09eff8f60da84b04c35 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/configs/model_glm_130b_int8.sh @@ -0,0 +1,16 @@ +MODEL_TYPE="glm-130b" +CHECKPOINT_PATH="" +MP_SIZE=8 +MODEL_ARGS="--model-parallel-size ${MP_SIZE} \ + --num-layers 70 \ + --hidden-size 12288 \ + --inner-hidden-size 32768 \ + --vocab-size 150528 \ + --num-attention-heads 96 \ + --max-sequence-length 2048 \ + --tokenizer-type icetk-glm-130B \ + --layernorm-order post \ + --quantization-bit-width 8 \ + --load ${CHECKPOINT_PATH} \ + --skip-init \ + --fp16" diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/configs/model_glm_130b_v100.sh b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/configs/model_glm_130b_v100.sh new file mode 100644 index 0000000000000000000000000000000000000000..0b33485e7f9f396b73bee37f5587f67c7b5822f2 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/configs/model_glm_130b_v100.sh @@ -0,0 +1,17 @@ +MODEL_TYPE="glm-130b" +CHECKPOINT_PATH="" +MP_SIZE=8 +MODEL_ARGS="--model-parallel-size ${MP_SIZE} \ + --num-layers 70 \ + --hidden-size 12288 \ + --inner-hidden-size 32768 \ + --vocab-size 150528 \ + --num-attention-heads 96 \ + --max-sequence-length 2048 \ + --tokenizer-type icetk-glm-130B \ + --layernorm-order post \ + --load ${CHECKPOINT_PATH} \ + --skip-init \ + --fp16 \ + --bminf \ + --bminf-memory-limit 25" diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/cuda/Makefile b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/cuda/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..46df11b60872a97e29b14aef55eb28c1e7ddba66 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/cuda/Makefile @@ -0,0 +1,22 @@ +NVCC=nvcc +OPTIONS=-gencode arch=compute_61,code=sm_61 \ + -gencode arch=compute_62,code=sm_62 \ + -gencode arch=compute_70,code=sm_70 \ + -gencode arch=compute_72,code=sm_72 \ + -gencode arch=compute_75,code=sm_75 \ + -gencode arch=compute_80,code=sm_80 \ + -gencode arch=compute_86,code=sm_86 + +TARGETS=$(patsubst %.cu, %.fatbin, $(wildcard *.cu)) + +all: $(TARGETS) + +%.fatbin: %.cu + $(NVCC) -fatbin $^ $(OPTIONS) -o $@ + +.PHONY : clean, copy +clean: + rm $(TARGETS) + +copy: + cp $(TARGETS) ../kernels/ diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/cuda/quantization.cu b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/cuda/quantization.cu new file mode 100644 index 0000000000000000000000000000000000000000..36ac67d63c2e9a9de5a8bef3d87ac521f7391b62 --- /dev/null +++ 
b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/cuda/quantization.cu @@ -0,0 +1,81 @@ +#include <cuda_fp16.h> + +template<typename T> +__device__ void +int4WeightExtractionDevice(const int8_t* weight, + const T* scale_list, + T* output, + const int n, + const int k) +{ + for(int i = blockIdx.x * k + threadIdx.x; i < blockIdx.x * k + k; i += blockDim.x){ + int8_t original = weight[i]; + int8_t high = original >> 4; + int8_t low = original << 4; low = low >> 4; + output[i * 2] = T(high) * scale_list[blockIdx.x]; + output[i * 2 + 1] = T(low) * scale_list[blockIdx.x]; + } +} + +__device__ void +int4WeightCompressionDevice(const int8_t* input, + int8_t* output, + const int n, + const int k) +{ + for(int i = blockIdx.x * k + threadIdx.x; i < blockIdx.x * k + k; i += blockDim.x){ + output[i] = (input[i * 2] << 4) | (input[i * 2 + 1] & 0b00001111); + } +} + +template<typename T> +__device__ void +int8WeightExtractionDevice(const int8_t* weight, + const T* scale_list, + T* output, + const int n, + const int k) +{ + for(int i = blockIdx.x * k + threadIdx.x; i < blockIdx.x * k + k; i += blockDim.x){ + output[i] = T(weight[i]) * scale_list[blockIdx.x]; + } +} + +extern "C" __global__ void int4WeightExtractionHalf(const int8_t* weight, + const half* scale_list, + half* output, + const int n, + const int k){ + int4WeightExtractionDevice<half>(weight, scale_list, output, n, k); + } + +extern "C" __global__ void int4WeightExtractionFloat(const int8_t* weight, + const float* scale_list, + float* output, + const int n, + const int k){ + int4WeightExtractionDevice<float>(weight, scale_list, output, n, k); + } + +extern "C" __global__ void int8WeightExtractionHalf(const int8_t* weight, + const half* scale_list, + half* output, + const int n, + const int k){ + int8WeightExtractionDevice<half>(weight, scale_list, output, n, k); + } + +extern "C" __global__ void int8WeightExtractionFloat(const int8_t* weight, + const float* scale_list, + float* output, + const int n, + const int k){ + int8WeightExtractionDevice<float>(weight, scale_list, output, n, k); + } + +extern "C" __global__ void int4WeightCompression(const int8_t* input, + int8_t* output, + const int n, + const int k){ + int4WeightCompressionDevice(input, output, n, k); + } diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/docs/evaluate-your-own-tasks.md b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/docs/evaluate-your-own-tasks.md new file mode 100644 index 0000000000000000000000000000000000000000..396035f9bdc7951760916c2f041bdd7ec38e419f --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/docs/evaluate-your-own-tasks.md @@ -0,0 +1,86 @@ +# Evaluate Your Own Tasks + +## YAML file for tasks + +We use YAML files to define tasks, which allows us to easily evaluate multiple tasks in a single run and configure them independently. Specifically, you can add multiple tasks or folders at a time for evaluation, and the script will automatically collect all YAML files under those folders recursively. + +``` +# Single node +bash scripts/evaluate.sh task1.yaml task2.yaml dir1 dir2 ... +# Multi node +bash scripts/evaluate_multiple_node.sh task1.yaml task2.yaml dir1 dir2 ... +``` + +We support two types of evaluation tasks: multi-choice and generation. The YAML config options for both tasks are defined in `evaluation/configs.py`.
Basically, all types of tasks share common configs defining task information: + +```yaml +name: 'glue_cola' # Task Name +type: 'mul' # Task type, 'gen' (generate) or 'mul' (multiple choice) +path: 'bloom/glue_cola' # task data path relative to DATA_PATH in 'evaluate.sh' +use_task_mask: False # Whether use [gMASK] for evaluation +unidirectional: False # Whether use unidirectional attention +max_seq_length: 2048 # Max sequence length +file-pattern: # Organize jsonl file in groups + validation: "**/validation.jsonl" # Will search for all file named 'validation.jsonl' in `DATA_PATH/bloom/glue_cola` using glob.glob() +micro-batch-size: 30 # 'gen' task only support mbs = 1 for now +``` + +See configuration details for multi-choice and generation tasks in `evaluation/configs.py`. + +## Data format for tasks + +We recommend organizing the task data in the following structure and setup up two groups named "validation" and "test" in the `file-pattern` config so that it becomes very easy to evaluate different prompts on both validation and test sets independently. + +```bash +DATA_PATH +└── task_name + ├── prompt_1 + │   ├── test.jsonl + │   └── val.jsonl + ├── prompt_2 + │   ├── test.jsonl + │   └── val.jsonl + └── prompt_3 + ├── test.jsonl + └── val.jsonl +``` + +The evaluation data for each prompt are organized into jsonline format. For multi-choice tasks, the format of each line of JSON should be + +```json +{ + "inputs_pretokenized": "Context and question here", + "choices_pretokenized": ["Choice 1", "Choice 2", "Choice 3"], + "label": int +} +``` + +The default metric for the multi-choice task is Accuracy. + +For the generation task, the format of each line of JSON should be + +```json +{ + "inputs_pretokenized": "Context and question here", + "targets_pretokenized": ["Target 1", "Target 2", "Target 3"], + "label": int +} +``` + +The default metrics for the generation task are EM(Exact-Match) and F1. Given inputs, the sequence generated by the model will be metricized separately from all targets and the highest value will be taken. + + +## Implement Your Metrics + +You can customize your evaluation metrics function and add it to `DEFAULT_METRICS` in `evaluation/metrics.py`, and then you can specify `metric: ['Your metric name']` in the task YAML file. + +## Fully customize the evaluation process + +By default, we implement classes named `MultiChoiceTask` and `GenerationTask` in `evaluation/tasks.py` for multi-choice tasks and generation tasks, respectively. + +You can implement a new task class and inherit from one of these two classes, and implement the `process_single_batch` function to define how to process a batch of inputs and get the predictions. Following [Big-Bench](https://github.com/google/BIG-bench/#creating-the-task), we implemented two methods you can use for your evaluation: + +- `model.cond_log_prob()`: Compute the probabilities of provided model outputs for given inputs. +- `model.generate_text()`: Generate text for given inputs. + +Once you have created the new task class, you need to specify the relative path to import the class in the `module` field of the task YAML file. See `tasks/lambada/tasks.py` and `tasks/lambada/lambada.yaml` for how we customize the beam search generation strategy for LAMBADA tasks and configure the YAML file. 
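To make the customization flow above concrete, here is a minimal sketch of a custom metric plus a custom generation task, placed for illustration in a hypothetical `tasks/my_task/tasks.py`. The import locations follow the files named above (`evaluation/metrics.py`, `evaluation/tasks.py`), but the exact signatures of the metric function, `process_single_batch`, and `model.generate_text()` are assumptions for illustration, not guaranteed APIs; check the real definitions under `evaluation/` before relying on them.

```python
# tasks/my_task/tasks.py -- hypothetical location; reference it from the task YAML via
# `module: tasks.my_task.tasks.MyGenerationTask` and request the metric via `metric: ['ContainsMatch']`.
from evaluation.metrics import DEFAULT_METRICS   # metric registry (evaluation/metrics.py)
from evaluation.tasks import GenerationTask      # base class (evaluation/tasks.py)


def contains_match(prediction: str, ground_truth: str) -> float:
    """Toy metric: 1.0 if the target string appears verbatim in the generated text.
    The (prediction, ground_truth) signature is an assumption for illustration."""
    return float(ground_truth.strip().lower() in prediction.strip().lower())


# Register the metric so a task YAML can select it by name.
DEFAULT_METRICS["ContainsMatch"] = contains_match


class MyGenerationTask(GenerationTask):
    """Generation task with a customized per-batch prediction step."""

    def process_single_batch(self, batch):
        # Delegate to the model wrapper; generate_text() is assumed here to return
        # the decoded predictions for the whole batch.
        return self.model.generate_text(batch, self.config)
```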
diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/docs/inference-with-fastertransformer.md b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/docs/inference-with-fastertransformer.md new file mode 100644 index 0000000000000000000000000000000000000000..7c55d56ec39ee6adfa66d35771a421933b595707 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/docs/inference-with-fastertransformer.md @@ -0,0 +1,156 @@ +# Inference with FasterTransformer + +[FasterTransformer](https://github.com/NVIDIA/FasterTransformer) provides a script and recipe to run the highly optimized transformer-based encoder and decoder component, and it is tested and maintained by NVIDIA. + +We adapted the GLM-130B based on Fastertransformer for fast inference, with details in [benchmark](#benchmark) section. + +## Download the Model + +See [Get Model](/README.md#environment-setup). + +## Recommend: Run With Docker + +Use Docker to quickly build a Flask API application for GLM-130B. + +### Requirements + +- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) + +### Build Container Image + +```bash +git clone https://github.com/THUDM/FasterTransformer.git +cd FasterTransformer +bash docker/build.sh +``` + +### Run API With Checkpoints + +Set MPSIZE to the number of gpus needed for the checkpoints, and DATA_TYPE to checkpoints precision. The checkpoint we distribute is in 8-way tensor parallel in FP16 precision, a conversion scripts is also provided if you need to change the tensor parallel dimension and the weight precision. + +```bash +# Convert the checkpoint to MP=4, DATA_TYPE=INT4 +python tools/convert_tp.py \ + --input-folder \ + --output-folder \ + --target-tp 8 \ + --quantization-bit-width 4 \ +# Run API +docker run -it --rm --gpus all --shm-size=10g -p 5000:5000 \ + -v /49300:/checkpoints:ro \ + -e MPSIZE=4 -e DATA_TYPE=int4 \ + ftglm:latest +``` + +### Test + +#### Benchmark + +```bash +python3 examples/pytorch/glm/glm_server_test.py +``` + +#### Web Demo + +```bash +pip install gradio +python3 examples/pytorch/glm/glm_server_frontend_test.py +``` + +## Manual Configuration + +### Requirements + +- CMake >= 3.13 for PyTorch +- CUDA 11.0 or newer version +- NCCL 2.10 or newer version +- Python 3 is recommended because some features are not supported in python 2 +- PyTorch: Verify on 1.10.1, >= 1.8.0 should work. + +### Setup Using Docker + +```bash +docker run -it --rm --gpus all nvcr.io/nvidia/pytorch:22.09-py3 /bin/bash +conda install -y pybind11 +``` + +### Setup Using Conda + +As another way, all the packages can be installed using conda. + +> Some of our current [structure](https://github.com/THUDM/FasterTransformer/blob/main/src/fastertransformer/th_op/glm/GlmOp.h#L30) requires that `g++` and `libtorch` produce the same results, so a pre-compiled `libtorch` may only work with `g++-7` or `g++-9`. And although GLM-130B itself does not rely on openmpi, FasterTransformer requires it during the build process. We are working on these issues. + +```bash +conda install -y cmake pybind11 +conda install -y -c conda-forge cudatoolkit-dev cudnn +cp -r $CONDA_PREFIX/lib/libcudnn* /usr/local/cuda/lib64/ +cp -r $CONDA_PREFIX/include/cudnn*.h /usr/local/cuda/include/ +``` + +If it's hard to install cudatoolkit-dev and cudnn by conda, just install them from [NVIDIA Developer](https://developer.nvidia.com/cuda-downloads), and make sure cmake is able to find cudnn. 
+ +```bash +cp cudnn/include/cudnn*.h /usr/local/cuda/include +cp cudnn/lib/libcudnn* /usr/local/cuda/lib64 +chmod a+r /usr/local/cuda/include/cudnn*.h +chmod a+r /usr/local/cuda/lib64/libcudnn* +``` + +GLM-130B is trained with FP16 precision, a total of 260G of GPU memory is required to store model weights. The model is tested with 8 * 40G A100s. + +### Build + +Get the code and install all dependencies: + +```bash +git clone https://github.com/THUDM/FasterTransformer.git +mkdir -p FasterTransformer/build +cd FasterTransformer/build +pip3 install icetk transformers +``` + +Note: the `xx` of `-DSM=xx` in following scripts means the compute capability of your GPU. For example, 60 (P40) or 61 (P4) or 70 (V100) or 75(T4) or 80 (A100) or 86(RTX 3090). Default setting is including 70, 75, 80 and 86. + +```bash +cmake -DSM=80 -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_MULTI_GPU=ON .. +make -j +``` + +### Run GLM-130B + +Generate the `gemm_config.in` file. + +```bash +# ./bin/gpt_gemm +./bin/gpt_gemm 1 1 128 96 128 49152 150528 1 8 +``` + +Running GLM_130B in Pytorch and Flask. + +```bash +bash ../examples/pytorch/glm/glm-server.sh +``` + +You need to check and edit this file to set arguments such as `CHECKPOINT_PATH`. + +## Optimization methods + +Optimization in GLM_130B are similar to optimization in GPT and GPT-J, describing in the [FasterTransformer/gpt_guide.md](https://github.com/NVIDIA/FasterTransformer/blob/main/docs/gpt_guide.md). Meanwhile, some of the operators are differ from GPT, such as the implementation of RotaryEmbedding, and the use of GeGLU, so we add them additionally into FasterTransformer. + +## Benchmark + +- Hardware: DGX-A100(8 * 40G) + +## Encode + +| **Sequence Len** | 512 | 1024 | 2048 | +| ---------- | ------ | ------ | ------ | +| Megatron | 145 ms | 250 ms | 453 ms | +| FasterTransformer | 120 ms | 220 ms | OOM | + +## Decode + +| **Sequence Len** | 512 | 1024 | 2048 | +| ---------- | ------- | ------- | -------- | +| Megatron | 45.21 s | 89.00 s | 179.22 s | +| FasterTransformer | 18.77 s | 39.81 s | 89.88 s | diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/docs/low-resource-inference.md b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/docs/low-resource-inference.md new file mode 100644 index 0000000000000000000000000000000000000000..5dbea233934f59651ca276f6640e1b09753c9c46 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/docs/low-resource-inference.md @@ -0,0 +1,28 @@ +# Low-resource Inference with BMInf + +GLM-130B is trained with 4-way tensor parallel and 8-way pipeline parallel for efficiency. Then the checkpoint is converted into a 8-way tensor parallel one in order to inference the model in a single node. GLM-130B has 130 billion parameters in FP16 precision, a total of 260G of GPU memory is required to store model weights. The DGX-A100 server has 8 A100s and provides an amount of 320G of GPU memory (640G for 80G A100 version) so it suits GLM-130B well. + +However, a server with 8 * 32G V100 only provides an amount of 256G of GPU memory, which indicates that the full loading of model weights is not possible. Fortunately, with the swap-in-and-out feature between CPU and GPU memory provided by the [BMInf](https://github.com/OpenBMB/BMInf) library, GLM-130B can still run on servers with a smaller amount of GPU memory. 
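As a rough sanity check of these memory figures, the arithmetic behind them can be reproduced in a few lines of plain Python; the per-GPU number assumes the 8-way tensor parallel checkpoint described above.

```python
# Back-of-the-envelope check of the memory numbers quoted in this section.
N_PARAMS = 130e9     # GLM-130B parameter count
FP16_BYTES = 2       # bytes per parameter in FP16
MP_SIZE = 8          # 8-way tensor parallel checkpoint

total_gb = N_PARAMS * FP16_BYTES / 1e9     # ~260 GB of weights in total
per_gpu_gb = total_gb / MP_SIZE            # ~32.5 GB of weights per GPU

print(f"total weights: {total_gb:.0f} GB")      # 260 GB -> fits 8 * 40 GB A100 (320 GB)
print(f"per GPU (MP=8): {per_gpu_gb:.1f} GB")   # > 32 GB of a V100, hence BMInf swapping
```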
After joint debugging with the BMInf team, we achieved a resonable evaluation efficiency on DGX-1 servers with 8 * 32G V100 by carefully overlapping computation and communication, see the [benchmark section](#benchmark) for details. + +We have integrated BMInf into our codebase, just install BMInf via `pip install bminf`, and change the model configuration file from `configs/model_glm_130b.sh` to `configs/model_glm_130b_v100.sh` in your launch shell script. The default BMInf config is for V100 servers, you can also adjust the maximum memory the model weights can occupy on one GPU by setting `--bminf-memory-limit` according to your GPU memory in the model config file. + +## Benchmark + +### Evaluation + +- CoLA task on the validation set +- Micro Batch Size = 30 +- BMInf: 25GB model weights in GPU memory limit by: `--bminf-memory-limit 25` + +| | Peak GPU Memory | Time | +| -------------- | ---------- | ------ | +| A100-SAT | 40.3 G | 74.6 s | +| V100-SAT | OOM | OOM | +| V100-SAT-BMInf | 32.3 G | 196.0 s | + +The `micro-batch-size` config in task YAML files is configured according to the maximum utilization of the DGX-A100 server. If you encounter an OOM error on the V100 server, please adjust the `micro-batch-size` appropriately. + +### Text generation + +In text generation, due to the small amount of calculation per model forward (usually <10 tokens/forward using beam search strategy), the communication between the CPU and GPU memory becomes the bottleneck. With the help of the BMInf team, we did an in-depth profile on our V100 server. Given a 25GB model weight limit per GPU, a total of 13 layers need to be copied from CPU to GPU for a single forward, each layer will take about 75ms on IO, indicating that the real IO speed between CPU and GPU is `260GB / 70 / 8 / 75ms = 6.19GB/s`. Our V100 server uses PCI-E 3.0 and two V100s share a switch, so the theoretical bandwidth for each GPU is 8GB/s, close to our profiling results. A server with PCI-E 4.0 will greatly reduce the IO time. Even that, long text generation tokens can still take several minutes so **we do not recommend using V100 servers in text generation scenario**. For this, we are working on INT8 quantization so that GLM-130B can even fit a single RTX-3090 server (24G * 8). + diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/docs/quantization.md b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/docs/quantization.md new file mode 100644 index 0000000000000000000000000000000000000000..3fc869c0cbcf4e0f7898f574d20d92a6614c94dc --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/docs/quantization.md @@ -0,0 +1,66 @@ +# Quantization of GLM-130B + +## Usage + +> Please note that SwissArmyTransformer>=0.2.11 is required for quantization + +Set `CHECKPOINT_PATH` in `configs/model_glm_130b_{int4/int8}.sh` to your local checkpoint folder. The model will be first initialized from the FP16 checkpoint on the CPU memory, then dynamically quantized and transferred to the GPU memory. So please make sure you have enough CPU memory (>260GB) to store the FP16 model weights. + +You need to pay attention to the tensor parallel dimension of the model checkpoint, we only provide the checkpoint in 8-way tensor parallel, i.e. 8 GPUs store a whole model. If you need to do inference on a small number of GPUs, e.g. 4 * RTX 3090 GPUs with INT4 precision, you first need to convert the checkpoint to 4-way tensor parallel using the following command and modify `MP_SIZE` in corresponding model config file. 
+
+```bash
+python tools/convert_tp.py \
+    --input-folder <INPUT_FOLDER> \
+    --output-folder <OUTPUT_FOLDER> \
+    --target-tp 4
+```
+
+Finally, change the model config file from `configs/model_glm_130b.sh` to `configs/model_glm_130b_{int4/int8}.sh` in your scripts (e.g. `scripts/generate.sh`), then run your scripts as normal.
+
+By default, the full-precision checkpoint is expected to be loaded. Running the conversion script with `--quantization-bit-width <4 or 8>` will produce quantized model weights. To load from a quantized checkpoint, add `--from-quantized-checkpoint` to your model config file.
+
+## Evaluation Results
+
+|      | **MMLU (Accuracy↑)** | **LAMBADA (Accuracy↑)** | **WikiText-2 (PPL↓)** | **WikiText-103 (PPL↓)** | **PTB (PPL↓)** |
+| ---- | -------- | ----------- | ------------------- | --------------------- | ------------ |
+| FP16 | 44.751 | 80.206 | 10.901 | 10.759 | 18.964 |
+| INT8 | 44.709 | 80.206 | 10.904 | 10.763 | 18.994 |
+| INT4 | 44.801 | 79.468 | 11.167 | 11.046 | 19.535 |
+
+## Space and Speed Benchmark
+
+| **Hardware** | **GPU Memory** | **Precision** | **512** | **1024** | **2048** |
+| ------------ | -------------- | ------------ | -------- | -------- | -------- |
+| 8 * A100 | 40 GB | FP16 | 45.21 s | 89.00 s | 179.22 s |
+| 8 * V100 | 32 GB | INT8 | 106.35 s | 216.50 s | 449.17 s |
+| 4 * RTX 3090 | 24 GB | INT4 | 138.66 s | 292.69 s | 649.64 s |
+| 8 * RTX 2080 Ti | 11 GB | INT4 | 117.39 s | 240.96 s | 528.66 s |
+
+The results in the table above were measured with SAT. Using FasterTransformer speeds up inference by more than 2X, as shown in the table below; detailed usage is described in [Inference with FasterTransformer](../docs/inference-with-fastertransformer.md).
+
+| **Hardware** | **GPU Memory** | **Precision** | **128** Encode / Decode | **512** Encode / Decode | **1024** Encode / Decode | **2048** Encode / Decode |
+| --------------- | -------------- | ------------ | ----------------------- | ----------------------- | ------------------------ | ------------------------ |
+| 8 * A100 | 40 GB | INT4 | 145 ms / 4.29 s | 183 ms / 17.7 s | 313 ms / 37.8 s | 495 ms / 86.0 s |
+| 4 * A100 | 80 GB | INT4 | 174 ms / 6.62 s | 272 ms / 27.1 s | 439 ms / 56.2 s | 810 ms / 123 s |
+| 8 * V100 | 32 GB | INT4 | 309 ms / 6.97 s | 666 ms / 28.1 s | 1208 ms / 58.4 s | 2304 ms / 125 s |
+| 4 * V100 | 32 GB | INT4 | 448 ms / 11.4 s | 843 ms / 45.87 s | 1488 ms / 93.5 s | 2803 ms / 196 s |
+| 8 * RTX 3090 | 24 GB | INT4 | 283 ms / 5.07 s | 915 ms / 20.5 s | 1793 ms / 42.7 s | 3477 ms / 90.3 s |
+| 4 * RTX 3090 | 24 GB | INT4 | 374 ms / 8.16 s | 1300 ms / 32.3 s | OOM / 66.5 s | OOM / 150 s |
+| 8 * RTX 2080 Ti | 11 GB | INT4 | 392 ms / 6.77 s | 1044 ms / 27.29 s | OOM / 56.02 s | OOM / OOM |
+
+## Details
+
+Typical methods quantize both model weights and activations to INT8, enabling the INT8 matrix multiplication kernel for efficiency. However, we found that there are outliers in GLM-130B's activations, making it hard to reduce the precision of activations.
+
+Concurrently, researchers from [Meta AI](https://arxiv.org/abs/2208.07339) also found an emergent outlier issue in large-scale transformers (>6.8B parameters), which is consistent with our observations on GLM-130B. They conducted an in-depth analysis and found that the outliers make up only about 0.1% of all feature dimensions, so it is possible to decompose the matrix multiplication and use high-precision multiplication only for these particular dimensions.
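+
+As a rough illustration of that idea (an editorial sketch, not the kernel used in GLM-130B or in the Meta AI implementation; all names and shapes are made up), the outlier columns can be multiplied in full precision while the remaining columns are symmetrically quantized to INT8 first:
+
+```python
+import torch
+
+def decomposed_matmul(x, w, outlier_idx):
+    # x: [tokens, hidden], w: [hidden, out]; outlier_idx: feature dims with outliers
+    keep = torch.ones(x.shape[-1], dtype=torch.bool)
+    keep[outlier_idx] = False
+
+    # high-precision path for the few outlier dimensions
+    out = x[:, ~keep] @ w[~keep, :]
+
+    # symmetric per-vector INT8 quantization for everything else
+    x_sub, w_sub = x[:, keep], w[keep, :]
+    sx = x_sub.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / 127  # per-token scale
+    sw = w_sub.abs().amax(dim=0, keepdim=True).clamp(min=1e-8) / 127  # per-column scale
+    xq = torch.round(x_sub / sx).to(torch.int8)
+    wq = torch.round(w_sub / sw).to(torch.int8)
+    # dequantize-then-matmul for clarity; a real kernel would run an INT8 GEMM here
+    return out + (xq.float() * sx) @ (wq.float() * sw)
+
+x, w = torch.randn(4, 512), torch.randn(512, 256)
+print((decomposed_matmul(x, w, outlier_idx=[3, 71]) - x @ w).abs().max())
+```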
+ +| ![](media/16613396005977.jpg) | +|:--:| +| *Distribution of outliers (the white ones) in GLM-130B's activation* | + +Unfortunately, the outliers in GLM-130B can sometimes make up at most 30% of the feature dimension, possibly because we used GLU as a variant of FFN. Therefore, a mixed-precision decomposition for matmul can be much less efficient than a single FP16 matmul. After a few weeks of trial, we finally decided to keep the precision of activations to FP16 and only consider the quantization of model weights. In that case, the quantized model parameters are dynamically converted to FP16 precision at runtime, introducing a small computational overhead but greatly reducing GPU memory requirements for storing model weights. + +We quantized all linear layers as they take up most of the model parameters. All model weights, excluding input/output embedding, layernorm and bias terms are quantized using vector-wise symmetric quantization. At the quantization precision of INT4, two INT4 weights are compressed into one INT8 weight for saving GPU memory usage, so that only 70GB of GPU memory approximately is required for INT4 model weights. + + diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluate.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..fef259f8aa9ba01b7f49da0b53a4ac71e0884463 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluate.py @@ -0,0 +1,67 @@ +import time +import importlib + +from os.path import join, isdir, isfile, relpath +from glob import glob + +from evaluation import BaseConfig, ModelForEvaluation, DEFAULT_CLASS, print_rank_0 +from initialize import initialize, initialize_model_and_tokenizer + + +def add_evaluation_specific_args(parser): + """Arguments for evaluation""" + group = parser.add_argument_group("evaluation", "Evaluation configurations") + + # Task + group.add_argument("--task", nargs="+", default=[], help="All task config to evaluation") + group.add_argument("--data-path", type=str, required=True, help="Data dir path for all tasks") + return parser + + +def find_all_tasks(all_task_config_path): + tasks = [] + for task in all_task_config_path: + if isdir(task): + tasks += [relpath(path, ".") for path in glob(join(task, "**/*.yaml"), recursive=True)] + elif isfile(task): + tasks.append(task) + return tasks + + +def evaluate_all_tasks(data_path, model, tokenizer, all_task_config_path, task_classes): + for config_path, task_class in zip(all_task_config_path, task_classes): + config = task_class.config_class().from_yaml_file(config_path) + config.path = join(data_path, config.path) + task = task_class(model, tokenizer, config) + task.evaluate() + + +def main(): + args = initialize(extra_args_provider=add_evaluation_specific_args) + args.task = find_all_tasks(args.task) + + task_classes = [] + print_rank_0("> Loading task configs") + for task_config_path in args.task: + config = BaseConfig.from_yaml_file(task_config_path) + if config.module: + path = ".".join(config.module.split(".")[:-1]) + module = importlib.import_module(path) + class_name = config.module.split(".")[-1] + task_class = getattr(module, class_name) + task_classes.append(task_class) + else: + task_classes.append(DEFAULT_CLASS[config.type]) + print_rank_0(f" Task {config.name} loaded from config {task_config_path}") + print_rank_0(f"> Successfully load {len(task_classes)} task{'s' if len(task_classes) > 1 else ''}") + + model, tokenizer = initialize_model_and_tokenizer(args) + model = 
ModelForEvaluation(model) + + start = time.time() + evaluate_all_tasks(args.data_path, model, tokenizer, args.task, task_classes) + print_rank_0(f"Finish {len(task_classes)} task{'s' if len(task_classes) > 1 else ''} in {time.time() - start:.1f}s") + + +if __name__ == "__main__": + main() diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/__init__.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9a28fec7230d0c914da47789b8e4c298b3d4d7e --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/__init__.py @@ -0,0 +1,12 @@ +from .configs import * +from .model import ModelForEvaluation +from .tasks import BaseTask, GenerationTask, MultiChoiceTask, LanguageModelTask +from .dataset import GenerationTaskDataset, MultiChoiceTaskDataset, LanguageModelTaskDataset +from .metrics import qa_evaluate +from .utils import print_rank_0 + +DEFAULT_CLASS = { + TaskType.GENERATION: GenerationTask, + TaskType.MULTICHOICE: MultiChoiceTask, + TaskType.LANGUAGE_MODEL: LanguageModelTask, +} diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/configs.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/configs.py new file mode 100644 index 0000000000000000000000000000000000000000..e2982c0fe0475336083f5e9f16290ce1ef37fe7e --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/configs.py @@ -0,0 +1,59 @@ +from __future__ import annotations +from dataclass_wizard import YAMLWizard +from dataclasses import dataclass, field +from enum import Enum +from typing import Optional, List, Dict + + +class TaskType(Enum): + MULTICHOICE = "mul" + GENERATION = "gen" + LANGUAGE_MODEL = "lm" + OTHER = "other" + + +@dataclass +class BaseConfig(YAMLWizard): + name: str # Task name + type: TaskType # Task type + path: str # task data path relative to DATA_PATH + + module: Optional[str] = None # Custom task module file, optional + metrics: List[str] = field(default_factory=list) # Evaluation metrics + + use_task_mask: bool = False # Whether to use [gMASK] for evaluation + use_multitask_encoding: bool = False # Not supported now + unidirectional: bool = False # Whether to use unidirectional attention + max_seq_length: int = 2048 # Max sequence length + file_pattern: str | Dict[str, str] = "**/*.json*" # Organize data file in groups + + micro_batch_size: int = 1 # 'gen' task only support mbs = 1 for now + + def __post_init__(self): + assert self.use_task_mask or not self.unidirectional, "[MASK] doesn't support unidirectional attention" + + +@dataclass +class MultiChoiceTaskConfig(BaseConfig): + module = "evaluation.MultiChoiceTask" + metrics: List[str] = field(default_factory=lambda: ["Accuracy"]) + + +@dataclass +class GenerationTaskConfig(BaseConfig): + module = "evaluation.GenerationTask" + metrics: List[str] = field(default_factory=lambda: ["EM", "F1"]) + sampling_strategy: str = "BaseStrategy" + num_beams: int = 4 + length_penalty: float = 1.0 + no_repeat_ngram_size: int = 3 + min_gen_length: int = 0 + max_gen_length: int = 128 + + +@dataclass +class LanguageModelTaskConfig(BaseConfig): + module = "evaluation.LanguageModelTask" + metrics: List[str] = field(default_factory=lambda: ["PPL"]) + + generation_length: int = 256 # Generated length in each window diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/dataset.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/dataset.py new file mode 100644 index 
0000000000000000000000000000000000000000..ac9520d1b04847ae817bfcb872dfc8e4c66b73c9 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/dataset.py @@ -0,0 +1,371 @@ +import os +import math +import json + +import numpy as np +import torch + +from typing import List, Union +from abc import ABC, abstractmethod +from scipy.linalg import block_diag +from itertools import accumulate +from bisect import bisect_right + +from SwissArmyTransformer import get_tokenizer + +from .configs import BaseConfig, MultiChoiceTaskConfig, GenerationTaskConfig, LanguageModelTaskConfig +from .utils import get_tokenized_input + + +def pad_batch(tokens, position_ids, attention_mask, max_seq_length): + attention_mask = np.pad( + attention_mask, + pad_width=((0, max_seq_length - len(tokens)),), + mode="constant", + constant_values=0, + ) + tokens = np.concatenate((tokens, np.zeros(max_seq_length - len(tokens), dtype=np.int64))) + position_ids = np.concatenate((position_ids, np.zeros(max_seq_length - len(position_ids), dtype=np.int64))) + return tokens, position_ids, attention_mask + + +class EvaluationDataset(torch.utils.data.Dataset, ABC): + """ + Jsonlines of { + "text": context + "choices": [choice_id1,...], if not None, len(target) == 1 + "label": If generation task -1, else [0, len(choices)) + } + If [MASK] not in context, will append [MASK] after text + """ + + def __init__(self, path: Union[str, List[str]], config: BaseConfig): + self.path = path if isinstance(path, list) else [path] + self.config = config + self.max_seq_length = self.config.max_seq_length + self.dtype = np.int64 + + self.tokenizer = get_tokenizer() + self.mask_id = self.tokenizer.get_command("[MASK]") + self.gmask_id = self.tokenizer.get_command("[gMASK]") + + self.data = [] + for p in self.path: + self.process_single_file(p) + + @property + def has_collate_fn(self) -> bool: + return False + + def collate_fn(self, samples): + return None + + def process_single_file(self, path): + with open(os.path.join(path), "r", encoding="utf-8") as file: + for line in file: + item = json.loads(line) + self.data.append(self.process_single_item(item)) + + @abstractmethod + def process_single_item(self, item) -> dict: + pass + + def __len__(self): + return len(self.data) + + +class GenerationTaskDataset(EvaluationDataset): + config: GenerationTaskConfig + + def process_single_item(self, item): + text, targets = get_tokenized_input(item, "inputs"), get_tokenized_input(item, "targets") + if len(text) + self.config.max_gen_length + 2 > self.config.max_seq_length: + text_length = self.config.max_seq_length - self.config.max_gen_length - 2 + text = text[len(text) - text_length : len(text)] + return {"text": text, "targets": targets} + + @property + def has_collate_fn(self) -> bool: + return True + + def collate_fn(self, samples): + TILE = 32 + length_to_pad = (max(map(lambda spl: len(spl["token"]), samples)) + TILE - 1) // TILE * TILE + + token_batch, position_id_batch, attention_mask_batch = [], [], [] + context_length_batch, target_position_id_batch = [], [] + + for sample in samples: + token, position_id, attention_mask = pad_batch( + sample["token"], sample["position_id"], sample["attention_mask"], length_to_pad + ) + token_batch.append(token) + position_id_batch.append(position_id) + attention_mask_batch.append(attention_mask) + context_length_batch.append(sample['context_length']) + target_position_id_batch.append(sample['target_position_id']) + return { + "tokens": torch.tensor(np.array(token_batch), dtype=torch.int64), + "position_ids": 
torch.tensor(np.array(position_id_batch), dtype=torch.int64), + "attention_mask": torch.tensor(np.array(attention_mask_batch), dtype=torch.int64) < 0.5, + "context_length": torch.tensor(context_length_batch, dtype=torch.int64), + "target_position_ids": torch.tensor(np.array(target_position_id_batch), dtype=torch.int64), + } + + @staticmethod + def build_generation_sample(text, max_gen_length, use_task_mask, unidirectional=True): + tokenizer = get_tokenizer() + + sop_id = tokenizer.get_command("sop") + mask_id = tokenizer.get_command("[gMASK]") if use_task_mask else tokenizer.get_command("[MASK]") + + token = np.array(text, dtype=np.int64) + + blank_filling = mask_id in text + if blank_filling: + assert not unidirectional, "Unidirectional attention doesn't support blank filling" + assert not use_task_mask, "Unidirectional attention doesn't support task mask" + mask_position = text.index(mask_id) + token = np.concatenate((token, [sop_id])) + else: + mask_position = len(token) + if unidirectional: + token = np.concatenate(([mask_id, sop_id], token)) + else: + token = np.concatenate((token, [mask_id, sop_id])) + context_length = len(token) + + position_id = np.arange(0, context_length, dtype=np.int64) + target_position_id = np.arange(context_length, context_length + max_gen_length, dtype=np.int64) + if not use_task_mask: + position_id[context_length - 1:] = mask_position + target_position_id[:] = mask_position + + attention_mask = np.tril(np.ones((context_length, context_length), dtype=np.int64)) + if not unidirectional: + attention_mask[: context_length - 1, : context_length - 1] = 1 + + item = { + "token": token, + "position_id": position_id, + "target_position_id": target_position_id, + "attention_mask": attention_mask, + "context_length": context_length, + } + return item + + def __getitem__(self, idx): + item = self.data[idx] + sample = self.build_generation_sample( + item["text"], + max_gen_length=self.config.max_gen_length, + use_task_mask=self.config.use_task_mask, + unidirectional=self.config.unidirectional, + ) + sample["targets"] = [np.array(target, dtype=self.dtype) for target in item["targets"]] + return sample + + +class MultiChoiceTaskDataset(EvaluationDataset): + config: MultiChoiceTaskConfig + + def __init__(self, path, config: MultiChoiceTaskConfig): + self.is_single_token = True # set to False later in process_single_item func + super().__init__(path, config) + + @property + def has_collate_fn(self) -> bool: + return True + + def collate_fn(self, samples): + TILE = 32 + length_to_pad = (max(map(lambda spl: len(spl["token"]), samples)) + TILE - 1) // TILE * TILE + + token_batch, position_id_batch, attention_mask_batch = [], [], [] + choices_batch, choice_target_ids_batch = [], [] + + for sample in samples: + token, position_id, attention_mask = pad_batch( + sample["token"], sample["position_id"], sample["attention_mask"], length_to_pad + ) + token_batch.append(token) + position_id_batch.append(position_id) + attention_mask_batch.append(attention_mask) + choices_batch.append(sample["choices"]) + choice_target_ids_batch.append(sample["choice_target_ids"]) + + return { + "tokens": torch.tensor(np.array(token_batch), dtype=torch.int64), + "position_ids": torch.tensor(np.array(position_id_batch), dtype=torch.int64), + "attention_mask": torch.tensor(np.array(attention_mask_batch), dtype=torch.int64) < 0.5, + "choices": choices_batch, + "choice_target_ids": choice_target_ids_batch, + "is_single_token": self.is_single_token, + } + + def process_single_item(self, item): + text, 
choices, label = get_tokenized_input(item, "inputs"), get_tokenized_input(item, "choices"), item["label"] + + tgt_seq_length = sum([len(choice) for choice in choices]) + if tgt_seq_length == len(choices): + # For single token, we only insert one [sop] + tgt_seq_length = 1 + + assert tgt_seq_length < self.config.max_seq_length + if len(text) + tgt_seq_length + 2 > self.config.max_seq_length: + text_length = self.config.max_seq_length - tgt_seq_length - 2 + text = text[len(text) - text_length : len(text)] + + assert not ( + self.mask_id in text and self.config.use_multitask_encoding + ), "Unified multitask encoding don't support blank filling" + + if tgt_seq_length != 1: + self.is_single_token = False + + return { + "text": text, + "choices": choices, + "label": label, + } + + @staticmethod + def build_multiple_choice_sample( + text, choices, is_single_token, unified_multitask_encoding=False, use_task_mask=False + ): + tokenizer = get_tokenizer() + + sop_id = tokenizer.get_command("sop") + mask_id = tokenizer.get_command("[gMASK]") if use_task_mask else tokenizer.get_command("[MASK]") + + token = np.array(text, dtype=np.int64) + target = np.array(text, dtype=np.int64) + position_id = np.arange(len(text), dtype=np.int64) + choice_target_id = [] + + blank_filling = mask_id in text + if not blank_filling: + mask_position = len(token) + token = np.concatenate((token, [mask_id])) + target = np.concatenate((target, [mask_id])) + position_id = np.concatenate((position_id, [mask_position])) + else: + mask_position = text.index(mask_id) + + division = len(token) + attention_mask = [np.ones((len(token), len(token)), dtype=np.int64)] + + for choice in choices: + if use_task_mask == False: + position_id = np.concatenate( + ( + position_id, + [mask_position] * len(choice) + if blank_filling or not unified_multitask_encoding + else np.arange(mask_position, mask_position + len(choice), dtype=np.int64), + ) + ) + else: + position_id = np.concatenate( + ( + position_id, + np.arange(division, division + len(choice), dtype=np.int64), + ) + ) + + choice_target_id.append(np.arange(len(token), len(token) + len(choice), dtype=np.int64)) + attention_mask.append(np.tril(np.ones((len(choice), len(choice)), dtype=np.int64))) + token = np.concatenate((token, [sop_id], choice[:-1])) + target = np.concatenate((target, choice)) + + if is_single_token: + break + + attention_mask = block_diag(*attention_mask) + attention_mask[: len(token), :division] = 1 + + if is_single_token: + choices = np.array(choices, dtype=np.int64).squeeze().tolist() + + item = { + "token": token, + "position_id": position_id, + "attention_mask": attention_mask, + "choices": choices, + "choice_target_ids": choice_target_id[0] if is_single_token else choice_target_id, + } + return item + + def __getitem__(self, idx): + item = self.data[idx] + sample = self.build_multiple_choice_sample( + item["text"], + item["choices"], + is_single_token=self.is_single_token, + unified_multitask_encoding=self.config.use_multitask_encoding, + use_task_mask=self.config.use_task_mask, + ) + sample["label"] = item["label"] + return sample + + +class LanguageModelTaskDataset(EvaluationDataset): + config: LanguageModelTaskConfig + left_weights: List[int] + weights: List[int] + + def process_single_file(self, path): + num_sequences = [] + with open(os.path.join(path), "r", encoding="utf-8") as file: + raw_text = file.read() + tokens = self.tokenizer.tokenize(raw_text) + self.data.append( + { + "raw_text": tokens, + "num_original_tokens": len(raw_text.strip().split(" ")), + 
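+                    # number of sliding evaluation windows for this document: one initial window of up
+                    # to max_seq_length - 1 tokens, plus one window per additional generation_length tokens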
"num_sequences": max( + math.ceil( + max(len(tokens) - (self.config.max_seq_length - 1), 0) / self.config.generation_length + ) + + 1, + 1, + ), + } + ) + num_sequences.append(self.data[-1]["num_sequences"]) + self.weights = list(accumulate(num_sequences)) + self.left_weights = [0] + self.weights[:-1] + + def process_single_item(self, item): + pass + + def __len__(self): + return self.data[0]["num_sequences"] + + def __getitem__(self, idx): + document_idx = bisect_right(self.weights, idx) + idx = idx - self.left_weights[document_idx] + start_idx = idx * self.config.generation_length + end_idx = start_idx + self.config.max_seq_length - 1 # for additional [gMASK] + tokens = self.data[document_idx]["raw_text"][start_idx:end_idx] + + mask_id = self.gmask_id if self.config.use_task_mask else self.mask_id + sop_id = self.tokenizer.get_command("sop") + + if idx == 0 or self.config.unidirectional: + prompt, text = [], tokens + else: + prompt_length = self.config.max_seq_length - 1 - self.config.generation_length + prompt, text = tokens[:prompt_length], tokens[prompt_length:] + + seq_length = len(prompt) + len(text) + 1 + attention_mask = np.tril(np.ones((seq_length, seq_length), dtype=np.int64)) + attention_mask[: len(prompt) + 1, : len(prompt) + 1] = 1 + + return { + "tokens": np.array(prompt + [mask_id, sop_id] + text[:-1], dtype=np.int64), + "targets": np.array(prompt + [mask_id] + text, dtype=np.int64), + "position_ids": np.arange(0, seq_length, dtype=np.int64), + "attention_mask": attention_mask < 0.5, + "loss_masks": np.array([0] * (len(prompt) + 1) + [1] * len(text), dtype=np.int64), + } diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/metrics.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..af1b05b17f5fe146b165c5db8f029e6a166a1310 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/metrics.py @@ -0,0 +1,140 @@ +import re +import math +import string +import functools + +import torch +import numpy as np + +from typing import Tuple, List +from collections import Counter +from collections import defaultdict +from SwissArmyTransformer import get_tokenizer + +from .utils import print_rank_0 + + +def accuracy_metric(predictions, examples): + count = 0 + num_predictions = max(len(predictions), 1) + assert len(predictions) == len(examples) + for prediction, example in zip(predictions, examples): + count += prediction == example["label"] + return count * 100.0 / num_predictions + + +def F1_metric(predictions, examples): + assert len(predictions) == len(examples) + from sklearn.metrics import f1_score + + truth = [] + for prediction, example in zip(predictions, examples): + truth.append(example["label"]) + return f1_score(truth, predictions, average="micro") * 100.0 + + +def precision_metric(predictions, examples): + assert len(predictions) == len(examples) + from sklearn.metrics import precision_score + + truth = [] + for prediction, example in zip(predictions, examples): + truth.append(example["label"]) + return precision_score(truth, predictions, average="micro") * 100.0 + + +def recall_metric(predictions, examples): + assert len(predictions) == len(examples) + from sklearn.metrics import recall_score + + truth = [] + for prediction, example in zip(predictions, examples): + truth.append(example["label"]) + return recall_score(truth, predictions, average="micro") * 100.0 + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" 
+ + def remove_articles(text): + return re.sub(r"\b(a|an|the)\b", " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1_score(prediction, ground_truth): + prediction_tokens = normalize_answer(prediction).split() + ground_truth_tokens = normalize_answer(ground_truth).split() + common = Counter(prediction_tokens) & Counter(ground_truth_tokens) + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def exact_match_score(prediction, ground_truth): + return normalize_answer(prediction) == normalize_answer(ground_truth) + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + if not ground_truths: + return 0.0 + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def qa_evaluate(predictions, examples, metric): + assert len(examples) == len(predictions) + tokenizer = get_tokenizer() + + score = 0.0 + for example, prediction in zip(examples, predictions): + ground_truths = [tokenizer.tokenizer.decode(target) for target in example["targets"]] + prediction = tokenizer.tokenizer.decode(prediction) + if ground_truths: + score += metric_max_over_ground_truths(metric, prediction, ground_truths) + score = 100.0 * score / len(predictions) + return score + + +qa_exact_match = functools.partial(qa_evaluate, metric=exact_match_score) +qa_f1 = functools.partial(qa_evaluate, metric=f1_score) + + +def calculate_perplexity(loss: List[float], data): + return math.exp(min(20, np.sum(loss) / data[0]["num_original_tokens"])) + + +def special_for_dataset(predictions, examples): + print_rank_0("Metrics not found, maybe dataset special metric or metric name error") + return True + + +DEFAULT_METRICS = defaultdict(lambda: special_for_dataset) +DEFAULT_METRICS.update( + { + "EM": qa_exact_match, + "F1": qa_f1, + "Accuracy": accuracy_metric, + "PPL": calculate_perplexity, + "Precision": precision_metric, + "Recall": recall_metric, + "F1_mul": F1_metric, + } +) diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/model.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/model.py new file mode 100644 index 0000000000000000000000000000000000000000..b8a239636e86faee583539364ab7a3c8f89c87de --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/model.py @@ -0,0 +1,210 @@ +import torch + +from typing import List, Union + +from SwissArmyTransformer.generation.autoregressive_sampling import update_mems, get_masks_and_position_ids_default +from SwissArmyTransformer.mpu import vocab_parallel_cross_entropy + + +def print_rank_0(*args, **kwargs): + if torch.distributed.get_rank() == 0: + print(*args, **kwargs) + + +def batch_filling_sequence( + model, + seqs, + context_lengths, + strategy, + max_memory_length=100000, + get_masks_and_position_ids=get_masks_and_position_ids_default, + mems=None, + **kw_args + ): + ''' + seq: [2, 3, 5, ..., -1(to be generated), -1, ...] 
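+        context_lengths: [batch_size] tensor with the length of the given context of each sample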
+ mems: [num_layers, batch_size, len_mems(index), mem_hidden_size] + cache, should be first mems.shape[1] parts of context_tokens. + mems are the first-level citizens here, but we don't assume what is memorized. + input mems are used when multi-phase generation. + ''' + assert len(seqs.shape) == 2 + + # building the initial tokens, attention_mask, and position_ids + batch_size, context_length = seqs.shape + # seqs: [1, max_gen_len]; attention_mask: [1, 1, max_gen_len, max_gen_len]; position_ids: [1, max_seq_len] + seqs, attention_mask, position_ids = get_masks_and_position_ids(seqs) + # tokens: [1, context_len] + tokens = seqs[..., :context_length] + if attention_mask.dtype != torch.bool: + attention_mask = attention_mask.type_as(next(model.parameters())) # if fp16 + # initialize generation + counter = context_length - 1 # Last fixed index is ``counter'' + index = 0 if mems is None else mems.shape[2] # Next forward starting index, also the length of cache. + num_beams = 1 + # step-by-step generation + token_num = 0 + while counter < seqs.shape[1] - 1: + # Now, we want to generate seq[counter + 1], + # token[:, index: counter+1] needs forwarding. + # forward + token_num += 1 + # first: index=0; other: index=counter + tokens = tokens.reshape(batch_size * num_beams, -1) + mems = mems.reshape(mems.shape[0], batch_size * num_beams, mems.shape[-2], mems.shape[-1]) if mems is not None else None + logits, *output_per_layers = model( + tokens[:, index:], # first: [1, 0:counter]; other: [1, counter:counter+1] + position_ids[..., index: counter+1], # first: [1, 0:counter+1]; other: [1, counter:counter+1] + attention_mask[..., index: counter+1, :counter+1], # TODO memlen # first: [1,1,0:counter+1,0:counter+1]; other: [1,1,counter:counter+1,0:counter+1] + mems=mems, + **kw_args + ) + mem_kv = [o['mem_kv'] for o in output_per_layers] + mems = update_mems(mem_kv, mems, max_memory_length=max_memory_length) + if counter == context_length - 1: + logits = logits[torch.arange(batch_size), context_lengths - 1] + else: + logits = logits[:, -1] + counter += 1 + index = counter + # if torch.distributed.get_rank() == 0: + # print(f"counter: {counter}: logits: {logits.float().abs().mean()}") + # sampling + logits = logits.reshape(batch_size, num_beams, -1) + tokens = tokens.reshape(batch_size, num_beams, -1) + mems = mems.reshape(mems.shape[0], batch_size, num_beams, mems.shape[-2], mems.shape[-1]) + tokens, mems = strategy.forward(logits, tokens, mems) + if len(tokens.shape) == 3 and num_beams == 1: + num_beams = tokens.shape[1] + position_ids = position_ids.unsqueeze(1).expand(batch_size, num_beams, -1).reshape(batch_size * num_beams, -1) + attention_mask_shape = attention_mask.shape[-3:] + attention_mask = attention_mask.unsqueeze(1).expand(batch_size, num_beams, -1, -1, -1).reshape( + batch_size * num_beams, *attention_mask_shape) + if strategy.is_done: + break + print_rank_0("token num==========================: " + str(token_num)) + return strategy.finalize(tokens, mems) + + +class ModelForEvaluation(torch.nn.Module): + def __init__(self, model): + super().__init__() + + self.model = model + self.device = next(self.model.parameters()).device + + @staticmethod + def process_data(batch, device): + return ( + batch["tokens"].to(device=device).long(), + batch["position_ids"].to(device=device).long(), + batch["attention_mask"].to(device=device).bool().unsqueeze(1), + ) + + def cond_log_prob(self, batch) -> List[List[float]]: + """ + @return: Conditional log probability of each option + """ + tokens, position_ids, 
attention_mask = self.process_data(batch, self.device) + choices_batch, choice_target_ids_batch = batch["choices"], batch["choice_target_ids"] + is_single_token = batch["is_single_token"] + + self.model.eval() + with torch.no_grad(): + logits, *output_per_layers = self.model(tokens, position_ids, attention_mask, log_attention_weights=None) + logits_batch = torch.nn.functional.log_softmax(logits, dim=-1) + + # output: [b, sq, vocab] + log_probs = [] + + if is_single_token: # Single token + for logits, choices, choice_target_ids in zip(logits_batch, choices_batch, choice_target_ids_batch): + log_probs.append(logits[choice_target_ids[0], choices].tolist()) + else: # Multi token + for output, choices, choice_target_ids in zip(logits_batch, choices_batch, choice_target_ids_batch): + log_probs_single = [] + for choice, choice_target_id in zip(choices, choice_target_ids): + tmp = output[choice_target_id, choice] + log_probs_single.append(tmp.sum().tolist()) + log_probs.append(log_probs_single) + return log_probs + + def generate_text(self, sample, strategy, return_all_beams=False) -> Union[ + List[List[int]], List[List[List[int]]]]: + """ + @return: A list of text model generated, sorted by score in descending order + """ + + seqs = sample["tokens"].to(device=self.device).long() + context_lengths = sample["context_length"].long() + + def get_masks_and_position_ids(seq): + batch_size = seq.shape[0] + max_gen_length = sample['target_position_ids'].shape[-1] + tokens = torch.nn.functional.pad(seq, (0, max_gen_length), mode='constant', value=-1) + position_ids = torch.cat((sample['position_ids'], sample['target_position_ids']), dim=-1) + position_ids = position_ids.to(device=self.device).long() + attention_mask = sample["attention_mask"].to(device=self.device) + context_mask = attention_mask[torch.arange(batch_size), context_lengths - 1].unsqueeze(1).repeat(1, + max_gen_length, + 1) + causal_mask = torch.tril(context_mask.new_ones((batch_size, max_gen_length, max_gen_length))) < 0.5 + generation_mask = torch.cat( + (context_mask, causal_mask), dim=-1) + attention_mask = torch.nn.functional.pad(attention_mask, (0, max_gen_length), mode='constant', value=1) + attention_mask = torch.cat((attention_mask, generation_mask), dim=1) + attention_mask = attention_mask.bool().unsqueeze(1) + return tokens, attention_mask, position_ids + + self.model.eval() + with torch.no_grad(): + output = batch_filling_sequence( + self.model, + seqs, + context_lengths, + get_masks_and_position_ids=get_masks_and_position_ids, + strategy=strategy, + )[0] + + if isinstance(output, torch.Tensor): # different strategies + output = output.tolist() + + output_targets = [] + context_length = seqs.shape[1] + for lines in output: + lines = lines.tolist() if isinstance(lines, torch.Tensor) else lines + output_target = [] + if not isinstance(lines, list): + lines = [lines] + for line in lines: + unfinished = line.index(-1) if -1 in line else len(line) + if line[unfinished - 1] in strategy.end_tokens: + unfinished -= 1 + line = line[context_length:unfinished] + output_target.append(line) + if not return_all_beams: + output_targets.append(output_target[0]) + else: + output_targets.append(output_target) + return output_targets + + + def calculate_loss(self, batch) -> List[float]: + tokens, position_ids, attention_mask = self.process_data(batch, self.device) + targets, loss_masks = ( + batch["targets"].to(device=self.device).long(), + batch["loss_masks"].to(device=self.device).long(), + ) + + original_parallel_output = 
self.model.transformer.parallel_output + self.model.transformer.parallel_output = True + self.model.eval() + + with torch.no_grad(): + logits, *output_per_layers = self.model(tokens, position_ids, attention_mask, log_attention_weights=None) + losses = vocab_parallel_cross_entropy(logits.contiguous().float(), targets) + loss = torch.sum(losses * loss_masks, dim=-1) + + self.model.transformer.parallel_output = original_parallel_output + + return loss.tolist() diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/tasks.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/tasks.py new file mode 100644 index 0000000000000000000000000000000000000000..cdba83bd15c2113bb0db6251234ad9c9de4ee8c4 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/tasks.py @@ -0,0 +1,220 @@ +import torch +import time +import numpy as np +import torch.distributed as dist + +from typing import Dict, Callable, Type, Tuple, List, Any +from abc import ABC, abstractmethod +from glob import glob +from os.path import join, relpath +from collections import defaultdict + +from SwissArmyTransformer.tokenization.icetk_glm_130B.ice_tokenizer import _IceTokenizer + +from generation import BaseStrategy, BeamSearchStrategy +from .configs import BaseConfig, GenerationTaskConfig, MultiChoiceTaskConfig, LanguageModelTaskConfig +from .model import ModelForEvaluation +from .dataset import EvaluationDataset, GenerationTaskDataset, MultiChoiceTaskDataset, LanguageModelTaskDataset +from .utils import build_data_loader, gather_result, print_rank_0 +from .metrics import DEFAULT_METRICS + + +class BaseTask(ABC): + model: ModelForEvaluation + tokenizer: _IceTokenizer + config: BaseConfig + file_groups: Dict[str, List[str]] + + @classmethod + def config_class(cls) -> Type[BaseConfig]: + return BaseConfig + + @property + def metrics(self) -> Dict[str, Callable]: + return {metric: DEFAULT_METRICS[metric] for metric in self.config.metrics} + + def __init__(self, model: ModelForEvaluation, tokenizer: _IceTokenizer, config: BaseConfig): + self.model = model + self.tokenizer = tokenizer + self.config = config + self.config.metrics = list(self.metrics.keys()) + + self.file_groups = self.get_file_groups() + self.verbose = dist.get_rank() == 0 + + def get_file_groups(self): + pattern_group = {} + if isinstance(self.config.file_pattern, str): + pattern_group["all"] = self.config.file_pattern + else: + pattern_group = self.config.file_pattern + return { + name: [ + relpath(path, start=self.config.path) + for path in sorted(glob(join(self.config.path, pattern), recursive=True)) + ] + for name, pattern in pattern_group.items() + } + + def evaluate(self): + dist.barrier() + start = time.time() + print_rank_0("\n") + print_rank_0(f"{self.config}") + print_rank_0(f"Evaluating task {self.config.name}:") + + result_dict_all = {} + + for group_name, filelist in self.file_groups.items(): + print_rank_0(f" Evaluating group {group_name}:") + + result_dict_group = {} + for file in filelist: + dataset = self.build_dataset(file) + dataloader = build_data_loader( + dataset, + micro_batch_size=self.config.micro_batch_size, + num_workers=1, + drop_last=False, + collate_fn=dataset.collate_fn if dataset.has_collate_fn else None, + ) + + prediction = [] + with torch.no_grad(): + for _, batch in enumerate(dataloader): + prediction.append(self.predict_single_batch(batch)) + + prediction = gather_result(prediction, len(dataset), self.config.micro_batch_size) + result_dict = {key: metric(prediction, dataset.data) for key, metric in 
self.metrics.items()} + result_dict_group[file] = (result_dict, len(dataset)) + + if self.verbose: + self.report_single_metrics(file, result_dict) + + result_dict_all[group_name] = result_dict_group + + print_rank_0(f"Evaluation results of task {self.config.name}:") + + if self.verbose: + for group_name, result_dict_group in result_dict_all.items(): + self.report_group_metrics(group_name, result_dict_group) + self.report_overall_metrics( + {k: v for result_dict_group in result_dict_all.values() for k, v in result_dict_group.items()}, + ) + + print_rank_0(f"Finish task {self.config.name} in {time.time() - start:.1f}s.") + + def report_single_metrics(self, file: str, result_dict: Dict[str, float]): + output_str = f" Finish {file}" + for key, value in result_dict.items(): + output_str += f", {key} = {value:.3f}" + print_rank_0(output_str) + + @staticmethod + def calc_group_metrics(result_dict_group: Dict[str, Tuple[Dict[str, float], int]]): + metrics_dict = defaultdict(lambda: []) + weight = [] + for file, (result_dict, length) in result_dict_group.items(): + for key, value in result_dict.items(): + metrics_dict[key].append(value) + weight.append(length) + return { + name: { + "max": np.max(value), + "median": np.median(value), + "average": np.average(value, weights=weight), + } + for name, value in metrics_dict.items() + } + + def report_group_metrics(self, group_name, result_dict_group: Dict[str, Tuple[Dict[str, float], int]], level=1): + stats_dict = self.calc_group_metrics(result_dict_group) + if len(stats_dict) == 1: + name, stats = next(iter(stats_dict.items())) + print_rank_0( + " " * level + f"Group {group_name} {name}: max = {stats['max']:.3f}, " + f"median = {stats['median']:.3f}, average = {stats['average']:.3f}" + ) + else: + print_rank_0(" " * level + f" Group {group_name}: ") + for name, stats in stats_dict.items(): + print( + " " * (level + 1) + f"Metric {name}: max = {stats['max']:.3f}, " + f"median = {stats['median']:.3f}, average = {stats['average']:.3f}" + ) + + def report_overall_metrics(self, result_dict_all: Dict[str, Tuple[Dict[str, float], int]]): + pass + + @abstractmethod + def predict_single_batch(self, batch) -> List[Any]: + pass + + @abstractmethod + def build_dataset(self, relative_path: str) -> EvaluationDataset: + pass + + +class GenerationTask(BaseTask, ABC): + config: GenerationTaskConfig + + @classmethod + def config_class(cls): + return GenerationTaskConfig + + def build_dataset(self, relative_path): + return GenerationTaskDataset(join(self.config.path, relative_path), self.config) + + def __init__(self, model: ModelForEvaluation, tokenizer: _IceTokenizer, config: GenerationTaskConfig): + super(GenerationTask, self).__init__(model, tokenizer, config) + + end_tokens = [tokenizer.get_command("eop"), tokenizer.get_command("eos")] + if self.config.sampling_strategy == "BaseStrategy": + self.strategy = BaseStrategy(batch_size=self.config.micro_batch_size, temperature=1.0, top_k=1, + end_tokens=end_tokens) + elif self.config.sampling_strategy == "BeamSearchStrategy": + self.strategy = BeamSearchStrategy( + self.config.micro_batch_size, + self.config.num_beams, + length_penalty=self.config.length_penalty, + consider_end=True, + end_tokens=end_tokens, + no_repeat_ngram_size=self.config.no_repeat_ngram_size, + min_gen_length=self.config.min_gen_length, + deterministic=True, # For evaluation, we need a determined generation strategy + ) + else: + raise ValueError(f"unknown strategy {self.config.sampling_strategy}") + + def predict_single_batch(self, batch) -> 
List[List[int]]: + output = self.model.generate_text(batch, self.strategy, return_all_beams=False) + return output + + +class MultiChoiceTask(BaseTask, ABC): + config: MultiChoiceTaskConfig + + @classmethod + def config_class(cls): + return MultiChoiceTaskConfig + + def build_dataset(self, relative_path): + return MultiChoiceTaskDataset(join(self.config.path, relative_path), self.config) + + def predict_single_batch(self, batch) -> List[int]: + log_probs = self.model.cond_log_prob(batch) + return [np.argmax(log_probs_single).item() for log_probs_single in log_probs] + + +class LanguageModelTask(BaseTask, ABC): + config: LanguageModelTaskConfig + + @classmethod + def config_class(cls): + return LanguageModelTaskConfig + + def build_dataset(self, relative_path): + return LanguageModelTaskDataset(join(self.config.path, relative_path), self.config) + + def predict_single_batch(self, batch) -> List[float]: + return self.model.calculate_loss(batch) diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/utils.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a1be20638b9feeba3d1599eb798d6bfa77fe5a8c --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/evaluation/utils.py @@ -0,0 +1,67 @@ +import torch +import torch.distributed as dist + +from SwissArmyTransformer import mpu, get_tokenizer + + +def print_rank_0(*args, **kwargs): + if torch.distributed.get_rank() == 0: + print(*args, **kwargs) + + +def build_data_loader(dataset, micro_batch_size, num_workers, drop_last, collate_fn=None): + # Sampler. + world_size = mpu.get_data_parallel_world_size() + rank = mpu.get_data_parallel_rank() + sampler = torch.utils.data.distributed.DistributedSampler( + dataset, num_replicas=world_size, rank=rank, shuffle=False + ) + + # Data loader. Note that batch size is the per GPU batch size. 
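+    # shuffle stays False so that gather_result() can later restore the original dataset order
+    # from the per-rank predictions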
+ data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=micro_batch_size, + sampler=sampler, + shuffle=False, + num_workers=num_workers, + drop_last=drop_last, + pin_memory=True, + collate_fn=collate_fn, + ) + + return data_loader + + +def gather_result(prediction, total_length, micro_batch_size): + """ + @param prediction: Local predictions with order defined by distributed sampler + @param total_length: Total sample num + @return: [sample_0, sample_1, ..., sample_{total_length-1}] + """ + torch.cuda.empty_cache() + world_size = mpu.get_data_parallel_world_size() + prediction_gathered = [None for _ in range(world_size)] + dist.all_gather_object(prediction_gathered, prediction, group=mpu.get_data_parallel_group()) + prediction = [] + for i in range(len(prediction_gathered[0])): + for j in range(micro_batch_size): + for k in range(world_size): + if j < len(prediction_gathered[k][i]): + prediction.append(prediction_gathered[k][i][j]) + prediction = prediction[:total_length] + return prediction + + +def get_tokenized_input(item, key): + if key in item: + return item[key] + tokenizer = get_tokenizer() + pretokenized_key = key + "_pretokenized" + assert pretokenized_key in item + if isinstance(item[pretokenized_key], list): + result = [] + for raw in item[pretokenized_key]: + result.append(tokenizer.tokenize(raw)) + return result + else: + return tokenizer.tokenize(item[pretokenized_key]) diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/generate.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..59c4667dfd314edb644aa9afd06cdb91e70113eb --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/generate.py @@ -0,0 +1,220 @@ +import os +import torch +import stat +import re +from torch_npu.contrib import transfer_to_npu + +from functools import partial +from typing import List, Tuple + +from SwissArmyTransformer import mpu +from evaluation.model import batch_filling_sequence +from generation import BeamSearchStrategy, BaseStrategy +from SwissArmyTransformer.generation.utils import timed_name, generate_continually +from initialize import initialize, initialize_model_and_tokenizer + + +def add_generation_specific_args(parser): + parser.add_argument("--sampling-strategy", type=str, default="BaseStrategy", help="Type of sampling strategy.") + parser.add_argument("--min-gen-length", type=int, default=0, help="The minimum length each blank should generate.") + parser.add_argument( + "--print-all-beams", action="store_true", help="Print all output generated by beam search strategy." 
+ ) + + +def isEnglish(s): + try: + s.encode(encoding="utf-8").decode("ascii") + except UnicodeDecodeError: + return False + else: + return True + + +def get_masks_and_position_ids(seq, mask_position, max_gen_length, gmask=False): + context_length = seq.shape[1] + # [1, max_gen_len] + tokens = torch.nn.functional.pad(seq, (0, max_gen_length), mode="constant", value=-1) + # [1, max_gen_len, max_gen_len] + attention_mask = torch.ones((1, tokens.shape[-1], tokens.shape[-1]), device=tokens.device) + attention_mask.tril_() + attention_mask[..., : context_length - 1] = 1 + # [1, 1, max_gen_len, max_gen_len] + attention_mask.unsqueeze_(1) + attention_mask = (attention_mask < 0.5).bool() + + # [1, max_seq_len] + position_ids = torch.arange(tokens.shape[-1], dtype=torch.long, device=tokens.device) + if not gmask: + position_ids[context_length - 1 :] = mask_position + + position_ids = position_ids.unsqueeze(0) + + return tokens, attention_mask, position_ids + + +def fill_blanks(raw_text: str, model, tokenizer, strategy) -> Tuple[List[str], List[str], List[List[str]]]: + # add MASK + generation_mask = "[gMASK]" + if "[MASK]" in raw_text: + generation_mask = "[MASK]" + elif "[sMASK]" in raw_text: + generation_mask = "[sMASK]" + use_gmask = "[MASK]" not in raw_text and "[sMASK]" not in raw_text + + mask_pattern = r"\[[sg]?MASK\]" + text_list = re.split(mask_pattern, raw_text) + pattern_list = re.compile(mask_pattern).findall(raw_text) + seq = [] + for i in range(len(pattern_list)): + pattern = pattern_list[i] + sub_text = text_list[i] + seq.extend(tokenizer.tokenize(sub_text)) + seq.append(tokenizer.get_command(pattern)) + + seq.extend(tokenizer.tokenize(text_list[-1])) + + if "MASK]" not in raw_text: + seq += [tokenizer.get_command(generation_mask)] + raw_text += " " + generation_mask + if not raw_text.endswith("MASK]"): + seq = seq + [tokenizer.get_command("eos")] + if mpu.get_model_parallel_rank() == 0: + print("\nInput: {}\n".format(raw_text)) + if len(seq) > args.max_sequence_length: + raise ValueError("text too long.") + + # generation + is_english = isEnglish(raw_text) + output_list = [seq] + num_output = args.num_beams if args.sampling_strategy == "BeamSearchStrategy" else 1 + last_pos, answers, answers_with_style, blanks = ( + [0] * num_output, + ["" for _ in range(num_output)], + ["" for _ in range(num_output)], + [[] for _ in range(num_output)], + ) + + # continually detect the first mark position + while True: + seq = output_list[0] + # detect mask position + mask_token = tokenizer.get_command(generation_mask) + if mask_token not in seq: + break + mask_position = seq.index(mask_token) + + output_list = [] + + input_seq = torch.cuda.LongTensor( + [seq + [tokenizer.get_command("sop")]], + device=args.device, + ) + output, _ = batch_filling_sequence( + model, + input_seq, + torch.cuda.LongTensor([input_seq.shape[-1]], device=args.device), + strategy=strategy, + get_masks_and_position_ids=partial( + get_masks_and_position_ids, + mask_position=mask_position, + max_gen_length=args.out_seq_length - input_seq.shape[-1], + gmask=use_gmask, + ), + ) + if isinstance(output, torch.Tensor): # different strategies + output = output.tolist() + output = output[0] # batch_size = 1 + output_list.extend(output) + + # clip -1s and fill back generated things into seq + for i in range(len(output_list)): + output = output_list[i].tolist() if isinstance(output_list[i], torch.Tensor) else output_list[i] + try: + unfinished = output.index(-1) + except ValueError: + unfinished = len(output) + if output[unfinished - 
1] in strategy.end_tokens: + unfinished -= 1 + bog = output.index(tokenizer.get_command("sop")) + + prefix = tokenizer.detokenize(output[last_pos[i] : mask_position]) + blank = tokenizer.detokenize(output[bog + 1 : unfinished]) + answers_with_style[i] += ( + prefix + + (" " if is_english else "") + + ("\033[4m" if use_gmask else "\x1b[0;32m\033[4m") + + blank + + ("\033[0m" if use_gmask else "\033[0m\x1b[0m") + + (" " if is_english else "") + ) + blanks[i].append(blank) + last_pos[i] = mask_position + unfinished - (bog + 1) + output_list[i] = output[:mask_position] + output[bog + 1 : unfinished] + output[mask_position + 1 : bog] + + for i, output in enumerate(output_list): + if output[-1] == tokenizer.get_command("eos"): + output = output[:-1] + answers_with_style[i] += tokenizer.detokenize(output[last_pos[i] :]) + answers[i] = tokenizer.detokenize(output) + + return answers, answers_with_style, blanks + + +def main(args): + model, tokenizer = initialize_model_and_tokenizer(args) + + end_tokens = [tokenizer.get_command("eop"), tokenizer.get_command("eos")] + + if args.sampling_strategy == "BaseStrategy": + strategy = BaseStrategy( + batch_size=1, temperature=args.temperature, top_k=args.top_k, top_p=args.top_p, end_tokens=end_tokens + ) + elif args.sampling_strategy == "BeamSearchStrategy": + strategy = BeamSearchStrategy( + 1, + args.num_beams, + length_penalty=args.length_penalty, + consider_end=True, + end_tokens=end_tokens, + no_repeat_ngram_size=args.no_repeat_ngram_size, + min_gen_length=args.min_gen_length, + ) + else: + raise ValueError(f"unknown strategy {args.sampling_strategy}") + + def process(raw_text): + if args.with_id: + query_id, raw_text = raw_text.split("\t") + + answers, answers_with_style, blanks = fill_blanks(raw_text, model, tokenizer, strategy) + + # save + if args.with_id: + full_path = os.path.join(args.output_path, query_id + ".txt") + else: + prefix = raw_text.replace("/", "")[:20] + full_path = timed_name(prefix, ".txt", args.output_path) + if mpu.get_model_parallel_rank() == 0: + if args.print_all_beams and len(answers) > 1: + for idx, answer_with_style in enumerate(answers_with_style): + print(f"Output beam {idx}:", answer_with_style) # print the first. + if len(answer_with_style) > 120: + print("") + else: + print(f"Output:", answers_with_style[0]) # print the first. 
+ with open(full_path, "w", encoding="utf-8") as fout: + for answer in answers: + fout.write(answer + "\n") + + os.chmod(full_path, stat.S_IRWXO + stat.S_IRWXG + stat.S_IRWXU) + + os.makedirs(args.output_path, exist_ok=True) + generate_continually(process, args.input_source) + + +if __name__ == "__main__": + args = initialize(extra_args_provider=add_generation_specific_args) + + with torch.no_grad(): + main(args) diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/generation/__init__.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/generation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..540346c6456a634ebd73a9082dae5ba08ec5c8fe --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/generation/__init__.py @@ -0,0 +1 @@ +from .strategies import BaseStrategy, BeamSearchStrategy diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/generation/strategies.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/generation/strategies.py new file mode 100644 index 0000000000000000000000000000000000000000..92525bdb87d13f55c06ef8b14bdc55383b6f1e48 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/generation/strategies.py @@ -0,0 +1,193 @@ +import numpy as np +import torch +import torch.nn.functional as F +from SwissArmyTransformer.generation.sampling_strategies.base_strategy import top_k_logits + +class BaseStrategy: + def __init__(self, batch_size, invalid_slices=[], temperature=1., top_k=200, eps=1e-4, top_p=0.0, end_tokens=None): + self.batch_size = batch_size + self.invalid_slices = invalid_slices + self.temperature = temperature + self.topk = top_k + self.top_p = top_p + self.eps = eps + if end_tokens is None: + end_tokens = [] + self.end_tokens = end_tokens + self._is_done = np.zeros(self.batch_size, dtype=np.bool) + + @property + def is_done(self) -> bool: + return self._is_done.all() + + def forward(self, logits, tokens, mems, temperature=None): + logits = logits.view(-1, logits.size(-1)) + batch_size = tokens.shape[0] + if temperature is None: + temperature = self.temperature + logits = logits / temperature + for invalid_slice in self.invalid_slices: + logits[..., invalid_slice] = -65504 + + logits = top_k_logits(logits, self.topk, self.top_p) + probs = F.softmax(logits.float(), dim=-1) # float is essetial, due to a bug in Pytorch + pred = torch.multinomial(probs, num_samples=1) + for i in range(self.batch_size): + if i >= batch_size: + self._is_done[i] = True + elif self._is_done[i]: + pred[i] = -1 + elif pred[i].item() in self.end_tokens: + self._is_done[i] = True + tokens = torch.cat((tokens, pred.view(tokens.shape[:-1] + (1,))), dim=-1) + return tokens, mems + + def finalize(self, tokens, mems): + self._is_done = np.zeros(self.batch_size, dtype=np.bool) + return tokens, mems + + +class BeamSearchStrategy: + def __init__( + self, + batch_size, + num_beams, + length_penalty=1.0, + consider_end=False, + end_tokens=[], + invalid_slices=[], + no_repeat_ngram_size=0, + min_gen_length=0, + deterministic=False, + ): + self.batch_size = batch_size + self.num_beams = num_beams + self.length_penalty = length_penalty + self.end_tokens = end_tokens + self.ngram = no_repeat_ngram_size + self.min_gen_length = min_gen_length + self.invalid_slices = invalid_slices + self.consider_end = consider_end + self.deterministic = deterministic + self._init_cache() + + def _init_cache(self): + self.end_beams = [[] for _ in range(self.batch_size)] # list of LongTensors + self.end_beams_penalized_scores = [[] for _ in range(self.batch_size)] # list of LongTensors + 
self.cached_beam_scores = 0 # [batch_size] + self.cached_beam_ngram_bans = [[{} for _ in range(self.num_beams)] for _ in range(self.batch_size)] + self.length_generated = 0 + self._is_done = np.zeros(self.batch_size, dtype=np.bool) + + def _add_end_beams(self, score, beam, batch_idx): + score = score / ((5.0 + len(beam)) / 6) ** self.length_penalty # Magic number for OpenNMT + for i in range(len(self.end_beams[batch_idx]), -1, -1): + if i == 0 or score < self.end_beams_penalized_scores[batch_idx][i - 1]: + break + self.end_beams[batch_idx].insert(i, beam) + self.end_beams_penalized_scores[batch_idx].insert(i, score) + + self.end_beams[batch_idx] = self.end_beams[batch_idx][: self.num_beams] + self.end_beams_penalized_scores[batch_idx] = self.end_beams_penalized_scores[batch_idx][: self.num_beams] + + @property + def is_done(self) -> bool: + return self._is_done.all() + + def forward(self, logits, tokens, mems): + batch_size, num_beams, vocab_size = logits.shape + seq_len = tokens.shape[-1] + logits = logits.float() + for invalid_slice in self.invalid_slices: + logits[..., invalid_slice] = -65504 + if self.min_gen_length > self.length_generated: + for end_token in self.end_tokens: + logits[..., end_token] = -65504 + if self.ngram > 0 and seq_len > self.ngram: + for batch_idx in range(batch_size): + for i in range(num_beams): + ngram_prefix = tokens[batch_idx, i, -(self.ngram - 1) :].tolist() # TODO ngram=1 + for banned_index in self.cached_beam_ngram_bans[batch_idx][i].get(tuple(ngram_prefix), []): + logits[batch_idx, i, banned_index] = -65504 + + next_token_scores = F.log_softmax(logits, dim=-1) # [batch_size, vocab_size] + prev_scores = self.cached_beam_scores + if isinstance(prev_scores, torch.Tensor): + prev_scores = prev_scores[..., None].expand_as(next_token_scores) + next_token_scores = next_token_scores + prev_scores + + next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) + + probs = F.softmax(next_token_scores, dim=-1) + if num_beams < self.num_beams: # First token + probs = probs[..., :vocab_size] + if self.deterministic: + next_tokens = torch.topk(probs, k=(max(1, len(self.end_tokens)) + 1) * self.num_beams).indices # [2*nb] + else: + next_tokens = torch.multinomial( + probs, num_samples=(max(1, len(self.end_tokens)) + 1) * self.num_beams + ) # [2*nb] + next_token_scores = next_token_scores[torch.arange(batch_size).unsqueeze(1), next_tokens] + next_token_scores, _indices = torch.sort(next_token_scores, descending=True, dim=1) + next_tokens = next_tokens[torch.arange(batch_size).unsqueeze(1), _indices] + + next_indices = torch.div(next_tokens, vocab_size, rounding_mode="trunc") + next_tokens = next_tokens % vocab_size + + # select out end beams or continue beams + beam_continue_batch, score_continue_batch, mems_continue_batch = [], [], [] + for batch_idx in range(batch_size): + beam_continue = [] + scores_continue = [] + bans_continue = [] + mems_contiue = [] + for i in range(len(next_tokens[batch_idx])): + beam = torch.cat((tokens[batch_idx, next_indices[batch_idx, i]], next_tokens[batch_idx, i : i + 1])) + if not self._is_done[batch_idx] and int(next_tokens[batch_idx, i]) in self.end_tokens: + self._add_end_beams(next_token_scores[batch_idx, i], beam, batch_idx) + elif len(beam_continue) < self.num_beams: + beam_continue.append(beam) + mems_contiue.append(mems[:, batch_idx, next_indices[batch_idx, i]]) + # update caches + scores_continue.append(next_token_scores[batch_idx, i]) + if self.ngram > 0: + bans = 
self.cached_beam_ngram_bans[batch_idx][next_indices[batch_idx, i]].copy() + # TODO ngram=1 + ngram_prefix = tuple(tokens[batch_idx, next_indices[batch_idx, i], -(self.ngram - 1):].tolist()) + bans[ngram_prefix] = bans.get(ngram_prefix, tuple()) + (next_tokens[batch_idx, i],) + bans_continue.append(bans) + else: + break + beam_continue_batch.append(torch.stack(beam_continue)) + mems_continue_batch.append(torch.stack(mems_contiue, dim=1)) + score_continue_batch.append(scores_continue) + self.cached_beam_ngram_bans[batch_idx] = bans_continue + tokens = torch.stack(beam_continue_batch) + mems = torch.stack(mems_continue_batch, dim=1) + self.cached_beam_scores = torch.tensor(score_continue_batch, device=logits.device) + self.length_generated += 1 + for batch_idx in range(self.batch_size): + if batch_idx >= batch_size: + self._is_done[batch_idx] = True + elif ( + len(self.end_beams[batch_idx]) == self.num_beams + and self.end_beams_penalized_scores[batch_idx][-1] + >= self.cached_beam_scores[batch_idx].max() / ((5.0 + (seq_len + 1)) / 6) ** self.length_penalty + ): # We're done if none of current tokens will better than the worst in end_beams + self._is_done[batch_idx] = True + + return tokens, mems + + def finalize(self, tokens, mems): + if self.consider_end: + batch_size, num_beams = tokens.shape[:2] + for batch_idx in range(batch_size): + if not self._is_done[batch_idx]: + for i in range(num_beams): + self._add_end_beams(self.cached_beam_scores[batch_idx, i], tokens[batch_idx, i], batch_idx) + mems = None + ret = self.end_beams[:batch_size] + else: + ret = tokens + self._init_cache() + return ret, mems diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/initialize.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/initialize.py new file mode 100644 index 0000000000000000000000000000000000000000..147fce6a719cff58fae0e7e4f1725a4b337a6a77 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/initialize.py @@ -0,0 +1,118 @@ +import argparse +import torch +import time +import torch_npu + +from quantization import quantize + +from SwissArmyTransformer import get_args, get_tokenizer +from SwissArmyTransformer.arguments import initialize_distributed +from SwissArmyTransformer.training import load_checkpoint +from SwissArmyTransformer.model import GLM130B +from SwissArmyTransformer.mpu import get_model_parallel_world_size, get_model_parallel_rank, get_model_parallel_group + + +def add_bminf_args(parser): + """Arguments for BMInf""" + group = parser.add_argument_group("BMInf") + + group.add_argument("--bminf", action="store_true", help="Use BMInf to support low resource evaluation") + group.add_argument("--bminf-memory-limit", type=int, default=20, help="Max memory for model per GPU (in GB)") + return parser + + +def add_quantization_args(parser): + group = parser.add_argument_group("Quantization") + + group.add_argument("--quantization-bit-width", type=int, default=None) + group.add_argument("--from-quantized-checkpoint", action="store_true", help="Loading from a quantized checkpoint") + + +def add_initialization_args(parser): + group = parser.add_argument_group("Initialization") + + group.add_argument( + "--sequential-initialization", + action="store_true", + help="Initialize sequentially in tensor parallel group (reduce CPU RAM for initialization)", + ) + + +def initialize(extra_args_provider): + parser = argparse.ArgumentParser(add_help=False) + add_bminf_args(parser) + add_quantization_args(parser) + add_initialization_args(parser) + GLM130B.add_model_specific_args(parser) + 
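+    # extra_args_provider registers the caller's task-specific flags; parse_known_args()
+    # then consumes everything this parser knows about, the remaining argv is forwarded to
+    # SwissArmyTransformer's get_args(), and the two namespaces are merged into one `args`.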
extra_args_provider(parser) + known, args_list = parser.parse_known_args() + args = get_args(args_list) + args = argparse.Namespace(**vars(args), **vars(known)) + args.do_train = False + initialize_distributed(args) + return args + + +def initialize_model_and_tokenizer(args): + tokenizer = get_tokenizer(args) + + torch.distributed.barrier() + start = time.time() + + for i in range(get_model_parallel_world_size()): + if get_model_parallel_rank() == i: + # Initialize model + model = GLM130B(args).half() + + if args.from_quantized_checkpoint: + assert args.quantization_bit_width is not None + # Quantize model before moving to GPU + model = quantize(model, args.quantization_bit_width) + + # Load checkpoint + load_checkpoint(model, args) + + if args.quantization_bit_width is not None and not args.from_quantized_checkpoint: + # Quantize model before moving to GPU + model = quantize(model, args.quantization_bit_width) + + if args.bminf: + import bminf + + if torch.distributed.get_rank() == 0: + print(f"> BMInf activated, memory limit: {args.bminf_memory_limit} GB") + with torch.cuda.device(args.device): + model = bminf.wrapper(model, quantization=False, memory_limit=args.bminf_memory_limit << 30) + else: + torch.npu.set_device(torch_npu.npu.current_device()) + model = model.to(args.device) + if args.sequential_initialization: + torch.distributed.barrier(group=get_model_parallel_group()) + + torch.distributed.barrier() + if torch.distributed.get_rank() == 0: + print(f"> Model initialized in {time.time() - start:.1f}s") + + torch.cuda.empty_cache() + model.eval() + + # generate rotary embedding cache + original_parallel_output = model.transformer.parallel_output + model.transformer.parallel_output = True + with torch.no_grad(): + _, *_ = model( + torch.ones(1, args.max_sequence_length, device=torch.cuda.current_device(), dtype=torch.int64), + torch.arange(args.max_sequence_length, device=torch.cuda.current_device(), dtype=torch.int64).view(1, -1), + torch.randn( + 1, + 1, + args.max_sequence_length, + args.max_sequence_length, + device=torch.cuda.current_device(), + ) + < 0.5, + ) + model.transformer.parallel_output = original_parallel_output + torch.distributed.barrier() + + return model, tokenizer diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/input.txt b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/input.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f547aa34aa7bdc8e2125fe23b91f65e3991705f --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/input.txt @@ -0,0 +1,11 @@ +The greatest artist is [gMASK] +Who is the greatest artist? [gMASK] +Who is the greatest artist? The greatest artist is [gMASK] +Who is the greatest artist is [gMASK] +The greatest artist is [gMASK] +Who is the greatest artist [gMASK] +Who is the greatest artist? The greatest artist is [gMASK] +Who is the greatest artist? 
The greatest artist is [gMASK] +问题:冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者 回答:[gMASK] +问题:冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者 回答:[gMASK] +问题:冬天,中国哪座城市最适合避寒?问题描述:能推荐一些国内适合冬天避寒的城市吗?回答用户:旅游爱好者 回答:[gMASK] \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/kernels/__init__.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/kernels/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6037536fa515968a04253b7d67dd859fa5cb3310 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/kernels/__init__.py @@ -0,0 +1,99 @@ +import pkg_resources +import torch +import ctypes + +from typing import List +from cpm_kernels.kernels.base import LazyKernelCModule, KernelFunction, round_up + +RESOURCE_PACKAGE_NAME = __name__ + + +class Kernel: + def __init__(self, filename: str, function_names: List[str]): + filename = filename + ".fatbin" + if not pkg_resources.resource_exists(RESOURCE_PACKAGE_NAME, filename): + raise RuntimeError("File `%s` not found in `%s`" % (filename, RESOURCE_PACKAGE_NAME)) + self.filename = filename + self.code = pkg_resources.resource_string(RESOURCE_PACKAGE_NAME, filename) + self._function_names = function_names + self._cmodule = LazyKernelCModule(self.code) + + for name in self._function_names: + setattr(self, name, KernelFunction(self._cmodule, name)) + + +kernels = Kernel( + "quantization", + [ + "int4WeightCompression", + "int4WeightExtractionFloat", + "int4WeightExtractionHalf", + "int8WeightExtractionFloat", + "int8WeightExtractionHalf", + ], +) + + +def compress_int4_weight(weight: torch.Tensor): # (n, m) + with torch.cuda.device(weight.device): + n, m = weight.size(0), weight.size(1) + assert m % 2 == 0 + m = m // 2 + out = torch.empty(n, m, dtype=torch.int8, device="cuda") + stream = torch.cuda.current_stream() + + gridDim = (n, 1, 1) + blockDim = (min(round_up(m, 32), 1024), 1, 1) + + kernels.int4WeightCompression( + gridDim, + blockDim, + 0, + stream, + [ctypes.c_void_p(weight.data_ptr()), ctypes.c_void_p(out.data_ptr()), ctypes.c_int32(n), ctypes.c_int32(m)], + ) + return out + + +def extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int): + if source_bit_width == 8: + func = kernels.int8WeightExtractionHalf + elif source_bit_width == 4: + func = kernels.int4WeightExtractionHalf + else: + assert False, "Unsupported bit-width" + + with torch.cuda.device(weight.device): + n, m = weight.size(0), weight.size(1) + out = torch.empty(n, m * (8 // source_bit_width), dtype=torch.half, device="cuda") + stream = torch.cuda.current_stream() + + gridDim = (n, 1, 1) + blockDim = (min(round_up(m, 32), 1024), 1, 1) + + func( + gridDim, + blockDim, + 0, + stream, + [ + ctypes.c_void_p(weight.data_ptr()), + ctypes.c_void_p(scale_list.data_ptr()), + ctypes.c_void_p(out.data_ptr()), + ctypes.c_int32(n), + ctypes.c_int32(m), + ], + ) + return out + + +if __name__ == "__main__": + weight = torch.randn(4, 32).to(torch.int8).cuda() + scale = torch.ones(weight.size(0)).to(torch.half).cuda() + + print(weight) + b = compress_int4_weight(weight) + print(b) + + a = extract_weight_to_half(b, scale, source_bit_width=4) + print(a) diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/kernels/quantization.fatbin b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/kernels/quantization.fatbin new file mode 100644 index 0000000000000000000000000000000000000000..03f4431928aa1814e4ffcd91d61ee72b88157b68 Binary files /dev/null and 
b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/kernels/quantization.fatbin differ diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/logs/README.md b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/logs/README.md new file mode 100644 index 0000000000000000000000000000000000000000..04aa8a809b6eeccc16b4a52b70ff9913ff355ca9 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/logs/README.md @@ -0,0 +1,5 @@ +# Training Logs + +`main-log.md` contains detailed information about each restart of training during GLM-130B training. + +Tensorboard logs is available at [here](https://cloud.tsinghua.edu.cn/f/503ef9fa85b84fbba9ef/). diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/logs/main-log-en.md b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/logs/main-log-en.md new file mode 100644 index 0000000000000000000000000000000000000000..707a8875b59d091023653902cf223f8ecaff5881 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/logs/main-log-en.md @@ -0,0 +1,251 @@ +# The training notes of GLM-130B + +## Basic Information about GLM-130B + +- 130B:70 layers,12288 hidden size,32768 ffn hidden size, 150000 vocab size + - MP = 4, PP = 8 +- GLM + Rotary Positional Embedding + GeGLU + DeepNorm +- FP32 softmax with QKV scaling(no PB-Relax) +- Shrink embedding gradient with $\alpha=0.1$ +- Global batch size: 4224 + +## Environment + +- PyTorch 1.11 / CUDA 11.3 +- LargeScale@400893da37bb5cbe22c29e41c02a052369cc72ce +- DeepSpeed 0.6.1 +- apex@master + +## Speed Testing (with Different Batch Sizes) + +- 96 nodes, BSZ=176 * 24=4224 + - glm-130B-2022.05.05-19:34:16:134TFLOPS, 88.5s/iter, 48samples/s, +- 96 nodes, BSZ=256 * 24=6144 + - glm-130B-2022.05.05-19:43:13:141TFLOPS, 122.5s/iter, 50samples/s + +## 2022-05-06 04:00 Training starts + +- glm-130B-2022.05.05-19:53:15 + +## 2022-05-07 20:14 Node failure + +n30041, n30157 break down, changing saving interval to 100 steps (originally 500 steps, too long), restart from 4000 step + +- glm-130B-2022.05.07-13:44:59 + +## 2022-05-10 00:00 Increase alpha for embedding shrink, as we think the original alpha is too small (originally 0.1) + +add `--shrink-embedding-gradient-steps 6000 500` to warmup alpha to 1 from 6000 step within 500 steps + +- glm-130B-2022.05.09-16:02:04 + +## 2022-05-11 12:13 Node failure + +n30115 breaks down, restart from 7300 step + +- glm-130B-2022.05.11-05:55:32 + +## 2022-05-20 00:03 Node failure + +n30066 breaks down, restart from 15400 step + +- glm-130B-2022.05.19-19:56:19 + +Switch to another node pool, and restart from 15600 step + +- glm-130B-2022.05.20-01:58:57 + +## 2022-05-21 12:40 Replace node + +Finding that the training flop is only 127T, smaller than before; suspecting that the n30076 we have replaced in has some unknown errors and kicking it out from 16600 step; nothing changes + +## 2022-05-22 19:27 Node failure + +n30126 loses connection + +- glm-130B-2022.05.22-14:15:41 + +## 2022-05-26 04:30 Node failure + +n30039 reports missing GPUs + +- glm-130B-2022.05.25-22:23:12 + + +## 2022-05-28 11:50 Change Multi-task Instruction Pre-training (MIP) data (abolished) + +Restarts from 22800 step, change MIP data to the correct one (English & Chinese) + +- glm-130B-2022.05.28-03:52:26 +- events.out.tfevents.1653709957.9droa42ltcad5-0.1858.0 (abolished) + +## 2022-05-28 16:50 Change MIP data + +New MIP data (English & Chinese) leads to NaN loss at 22900 step; finding too much noises in Chinese multi-task data; switch to vanilla T0 training datasets + +- glm-130B-2022.05.28-09:18:12 +- 
events.out.tfevents.1653729502.9droa42ltcad5-0.5648.0(移除) + +## 2022-05-28 20:50 Add warmup (abolished) + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/C850748B-92A4-4F9F-932F-AD22330895D6_2/E8MboG8vrTTb2N51FRhkb6wsB4eyrD77USmM992obQgz/Image.png) + +Vanilla T0 datasets still lead to disconvergence; suspecting a changed task ratio leads to the instability; add argument `--warmup-samples-after-loading 2112000` to warmup 500 steps from 22800 step + +- glm-130B-2022.05.28-12:57:24 +- events.out.tfevents.1653742654.9droa42ltcad5-0.7942.0(移除) + +## 2022-05-29 01:30 Disconverges again, switch to self-supervised pre-training only (abolished) + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/028DE014-00FE-4521-BEEB-EF3F61BB8DA1_2/mgYybTR1OLgPkBysqMiUgGYNyIg8OQnf1yXI66grBeMz/Image.png) + +- Disconverges after warmup; suspecting that the distribution change is still too large; trying to restart using self-supervised pre-training only with data reshuffle, loading from 22800 step +- glm-130B-2022.05.28-18:05:33 +- events.out.tfevents.1653761143.9droa42ltcad5-0.9744.0 (abolished) +- global_step23200_text ++ Configuration file + +## 2022-05-29 Smoothing distribution shift (abolished) + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/E2BC463F-E519-461E-B1B0-99551DA940BE_2/0ZqN22TLyqRTvqOy6JNLeixEy4TarDJEF7DOvdh3saIz/Image.png) + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/9C7AC4B3-59AB-471A-872E-41CCBAE7E90D_2/0rpEmyAOcIkLyDGR2R4RQiBeUwbWIWiaHbHcwosx6yAz/Image.png) + +Self-supervised pre-training only seems to be stable; trying to smooth the distribution shift via a warmed-up ratio of correct T0 data from 22800 step + +- glm-130B-2022.05.29-05:17:06 +- events.out.tfevents.1653801436.9droa42ltcad5-0.13868.0 (abolished) + +## 2022-05-29 22:40 Smoothing data distribution shift & warmup learning rate + +- Disconverges; suspecting that learning rate requires warmup in this process, too + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/F5532A86-3AAC-4CCE-AC9B-A976B7736D7F_2/M4JZx5GYzNPuysPHXrn0R5Oo54rBhDwQxdErkOpFOhEz/Image.png) + +- Restart from 22800, warmup correct MIP data ratio and learning rate for 2000 steps; warmup embedding gradient shrink alpha from 0.2 to 1 by 6000 steps +- glm-130B-2022.05.29-17:35:45 + +## 2022-05-30 14:00 Node and file system failure + +Finding the warmup steps for embedding gradient shrink to be wrong (26850 steps instead of 6000 steps); changing the warmup steps implementation (according to the absolute number of samples); restarting from global_step23200 + +We discover that the restart is stacked in the data loading, which turns out to be an error of the Lustre file system. 
The result is that we cannot read the 2.3T text corpora and the engineer cannot help to recover the data, and we have to copy data from backup disk to the file system again (which takes few days) + +- glm-130B-2022.05.31-02:18:24 + +## 2022.05.03 20:00 Add DeepStruct data to MIP + +- Keeping the original warmup process; adding DeepStruct data to MIP portion; restart from 23500 step + +## 2022-06-01 22:22 Replace MIP data to a cleaner version + +Finding one noisy prompt in the task data for T0 (qqp) and DeepStruct respectively; removing them and restarting from 24500 step + +- glm-130B-2022.06.01-14:24:33 + +## 2022-06-02 12:00 Node failure + +- n30145 CPU error, restarting from 25000 step; removing the warmup process as it has ended +- glm-130B-2022.06.02-04:35:05 + +## 2022-06-02 09:30 Start to print multitask loss + +From 25800 step, we print multitask loss + +- glm-130B-2022.06.03-01:40:12 + +## 2022-06-02 15:00 Reduce learning rate and print gpt/bert loss + +The loss decreases slowly, and we think it might be attributed to a too large learning rate; from 26000 step, we half the learning rate + +- glm-130B-2022.06.03-07:26:16 + +## 2022-06-06 17:00 Node cluster maintenance + +The node cluster needs an upgrade from 9 am to 5 am + +- glm-130B-2022.06.06-10:00:39 + +PS: we observe a significant improvement of the file system's reading speed; only need 1 minute to load the checkpoint now + +## 2022-06-08 08:00 Node failure + +- glm-130B-2022.06.08-00:00:37 + +## 2022-06-09 13:30 Unexpected termination of the training + +Restarting from 23100 step; suspecting the network communication problem + +- glm-130B-2022.06.09-05:27:54 + +## 2022-06-12 10:00 Loss explodes + +From 33700 step, the training loss explodes. The loss-scale reduces drastically around 33710 step, and the loss explodes at 33740 step + +- tensorboard record:glm-130B-33700 + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/C46C7CFE-1B79-491C-90FC-5A88AE90E9DF_2/7ICMyH8v6GhAgngz5bVaDKwzYjFPyk99Ax27R5w56wMz/Image.png) + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/E56BCDE0-C798-429F-81E0-1A07CCB9BC0E_2/Ig2rfKnPmLadg39Jc38UEdK90LDxlAxoH0AxmAygxzAz/Image.png) + +- Restaring from 33600 step, reduce shrink embedding gradient from 1.0 to 0.5 +- glm-130B-2022.06.12-02:20:49 + +## 2022-06-14 03:00 Loss explodes + +At 35250 step, the loss explodes again; almost the same behavior as it is in 33700 step; breaking down without any signs + +tensorboard record:glm-130B-35250 + +- Restarting from 35200 step, and shrinking embedding gradient from 0.5 to 0.1 +- glm-130B-2022.06.14-02:28:21 + +## 2022-06-19 00:10 Node failure + +n30085 breaks down, restarting from 39600 step + +- glm-130B-2022.06.18-17:49:53 + +## 2022-06-20 09:10 Loss explodes + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/CA344108-3B01-469C-9ABE-C41002F76484_2/oEvBST5MP0I7S4qHmQUeE7DoPCsGFSrveAOOSyitSUwz/Image.png) + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/FED0DE40-A710-4259-AE98-26BCB9568C7A_2/kH4FijsPDVJFzkbaxz7BiX0RZrul1Wrye6cE5EV8ZG0z/Image.png) + +- tensorboard record:glm-130B-40800 +- `--skip-train-iteration-range 40701-40900` +- Restarting from 40700 step and skipping the noisy data in 40701-40900 steps +- glm-130B-2022.06.20-03:36:13 + +## 2022-06-22 10:40 
Gradient spikes + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/0B7E0A0C-4B11-4F52-BF10-E6B11A533BEF_2/yb1zC07di9zux8jbAi15gpqlstGHXZyjyMBEjO0gNKUz/Image.png) + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/1C60424A-0290-4070-9327-DF9DFD135020_2/XyVoPs1yMLIuzUyrDixSYfgjc2Y2Nuor20GCz0nSPkAz/Image.png) + +- The gradient norm experiences a spike, which seems to recover automatically; but the training loss experiences a drastic change +- `--skip-train-iteration-range 40701-40900` +- Restarting from 42400 and skipping data in 42401-42600 steps +- glm-130B-2022.06.22-02:38:20 + +## 2022-06-22 21:00 Gradient spikes + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/E406CC41-4180-4108-BCCF-5E727CEB8F09/1D7D801C-3226-4CB0-978C-F19B4DA46721_2/nmg9r87OFrdErZvY9xjiDIHvgPVLv39vy8ZVtGkj2H0z/Image.png) + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/E406CC41-4180-4108-BCCF-5E727CEB8F09/5F5CA3D6-AF58-4087-9806-1529D3A2EF6C_2/WSQqyBdv1rvzvNloXE6Ssql7GxMDoULU38FAQCv3778z/Image.png) + +- The gradient norm experiences a spike again, but the loss-scale seems stable. We think it might recover automatically. +- Rethinking on the repeating gradient spikes in recent days, we speculate it might be attributed to a too-slow learning rate decay in the late stage of pre-training; reducing minimum lr from 8e-6 to 4e-6 +- `--min-lr 4e-6` +- Restarting from 42700 step +- glm-130B-2022.06.22-13:03:53 + +## 2022.06.26 16:00 Node failure + +- Unexpected NVLink Error; restarting training +- glm-130B-2022.06.26-13:13:51 + +## 2022.06.29 00:00 Recover position_id + +- Restarting training from 48100 step; using another more consistent positional encoding (the original one has a different implementation for \[MASK\] and \[gMASK\]) +- glm-130B-2022.06.29-13:53:21 diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/logs/main-log.md b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/logs/main-log.md new file mode 100644 index 0000000000000000000000000000000000000000..eabdd4993247b5bf8dc177df1dde5f467a30f850 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/logs/main-log.md @@ -0,0 +1,251 @@ +# GLM-130B 训练日志 + +## 模型信息 + +- 130B:70 layers,12288 hidden size,32768 ffn hidden size, 150000 vocab size + - MP = 4, PP = 8 +- GLM + Rotary Positional Embedding + GeGLU + DeepNorm +- FP32 softmax with QKV scaling(no PB-Relax) +- Shrink embedding gradient with $\alpha=0.1$ +- Global batch size: 4224 + +## 环境版本 + +- PyTorch 1.11 / CUDA 11.3 +- LargeScale@400893da37bb5cbe22c29e41c02a052369cc72ce +- DeepSpeed 0.6.1 +- apex@master + +## 测速 + +- 96 nodes, BSZ=176 * 24=4224 + - glm-130B-2022.05.05-19:34:16:134TFLOPS, 88.5s/iter, 48samples/s, +- 96 nodes, BSZ=256 * 24=6144 + - glm-130B-2022.05.05-19:43:13:141TFLOPS, 122.5s/iter, 50samples/s + +## 2022-05-06 04:00 开始训练 + +- glm-130B-2022.05.05-19:53:15 + +## 2022-05-07 20:14 节点故障 + +坏掉 n30041, n30157 两个点,更改保存间隔为 100step,从 4000 step 开始训练 + +- glm-130B-2022.05.07-13:44:59 + +## 2022-05-10 00:00 提升 alpha + +加入 `--shrink-embedding-gradient-steps 6000 500` 从 6000 step 开始训练 + +- glm-130B-2022.05.09-16:02:04 + +## 2022-05-11 12:13 节点故障 + +坏掉 n30115 节点,从 7300 step 开始训练 + +- glm-130B-2022.05.11-05:55:32 + +## 2022-05-20 00:03 节点故障 + +坏掉 n30066 节点,从 15400 step 开始训练 + +- glm-130B-2022.05.19-19:56:19 + +再换一批节点,从 15600 step 开始训练 + +- glm-130B-2022.05.20-01:58:57 + +## 2022-05-21 
12:40 换节点 + +训练效率一直只有 127T 左右,怀疑之前加入的 n30076 存在问题,踢出后从 16600 step 开始训练,似乎不解决问题。 + +## 2022-05-22 19:27 节点故障 + +n30126 失联 + +- glm-130B-2022.05.22-14:15:41 + +## 2022-05-26 04:30 节点故障 + +n30039 掉卡 + +- glm-130B-2022.05.25-22:23:12 + + +## 2022-05-28 11:50 更换中英多任务数据(废除) + +从 22800 开始训练,换中英多任务数据 + +- glm-130B-2022.05.28-03:52:26 +- events.out.tfevents.1653709957.9droa42ltcad5-0.1858.0(移除) + +## 2022-05-28 16:50 更换英文多任务数据(废除) + +换新的多任务数据 22900 左右出现 nan,挂掉训练,检查发现中文多任务数据噪声极大,从 22800 换成平衡后的 t0 原始数据开始训练 + +- glm-130B-2022.05.28-09:18:12 +- events.out.tfevents.1653729502.9droa42ltcad5-0.5648.0(移除) + +## 2022-05-28 20:50 加入 warmup(废除) + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/C850748B-92A4-4F9F-932F-AD22330895D6_2/E8MboG8vrTTb2N51FRhkb6wsB4eyrD77USmM992obQgz/Image.png) + +换上平衡后且不泄漏的 t0 原始数据开始训练仍然有问题,推测是平衡后一些任务占比变大,其实等价于加入新任务的情况,加入参数 `--warmup-samples-after-loading 2112000` warmup 500 步从 22800 开始训练 + +- glm-130B-2022.05.28-12:57:24 +- events.out.tfevents.1653742654.9droa42ltcad5-0.7942.0(移除) + +## 2022-05-29 01:30 再次爆炸,换纯文本(废除) + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/028DE014-00FE-4521-BEEB-EF3F61BB8DA1_2/mgYybTR1OLgPkBysqMiUgGYNyIg8OQnf1yXI66grBeMz/Image.png) + +- warmup 以后还是炸了,分析可能是 distribution 变动仍然太过剧烈,先换纯文本 + reshuffle 尝试训练,从 22800 加载 +- glm-130B-2022.05.28-18:05:33 +- events.out.tfevents.1653761143.9droa42ltcad5-0.9744.0(废除) +- global_step23200_text ++ 配置文件 + +## 2022-05-29 逐渐修改数据分布(废除) + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/E2BC463F-E519-461E-B1B0-99551DA940BE_2/0ZqN22TLyqRTvqOy6JNLeixEy4TarDJEF7DOvdh3saIz/Image.png) + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/9C7AC4B3-59AB-471A-872E-41CCBAE7E90D_2/0rpEmyAOcIkLyDGR2R4RQiBeUwbWIWiaHbHcwosx6yAz/Image.png) + +文本似乎能稳定,那么尝试逐渐平滑修改数据分布, 从 22800 开始,逐渐修改数据分布到 t0 平衡数据 + +- glm-130B-2022.05.29-05:17:06 +- events.out.tfevents.1653801436.9droa42ltcad5-0.13868.0(废除) + +## 2022-05-29 22:40 逐渐修改数据分布并全面 warmup + +- 又挂了,分析可能是换新分布学习率也需要 warmup + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/F5532A86-3AAC-4CCE-AC9B-A976B7736D7F_2/M4JZx5GYzNPuysPHXrn0R5Oo54rBhDwQxdErkOpFOhEz/Image.png) + +- 从 22800 开始训练,数据和 lr 都 warmup 2000 步,shrink embbeding graident 从 0.2 warmup 6000 步到 1 +- glm-130B-2022.05.29-17:35:45 + +## 2022-05-30 14:00 挂节点 + +更改了一下参数配置,发现之前 shrink embedding 的步数写错了(26850 步),现在改成 6000 步。升级了一下 lr auto warmup 的逻辑,写成绝对 samples 数量。从 global_step23200 开始 + +我们发现这次训练卡在了数据加载,排查后发现是 Lustre 文件系统的故障,导致 2.3T 文本数据读不出来,且工程师无法修复;最终重新从移动硬盘拷贝了一次数据 + +- glm-130B-2022.05.31-02:18:24 + +## 2022.05.03 20:00 加 DeepStruct 数据 + +- 维持原有 transform 过程不变,但直接加入 DeepStruct 数据,从 23500 开始 + +## 2022-06-01 22:22 换清洗数据 + +之前的多任务数据 t0 和 deepsturct 各有一个任务的 target 异常,重新清洗后更换,从 24500 开始 + +- glm-130B-2022.06.01-14:24:33 + +## 2022-06-02 12:00 节点故障 + +- n30145 CPU 故障,从 25000 重启训练,lr 和 数据集已经 transfromer 完毕,所以配置直接去掉 warmup +- glm-130B-2022.06.02-04:35:05 + +## 2022-06-02 09:30 加入 multitask loss 打印 + +25800steps 开始,加入 multitask loss 打印 + +- glm-130B-2022.06.03-01:40:12 + +## 2022-06-02 15:00 降低学习率,加入 gpt/bert loss 打印 + +loss 降低比较慢,讨论可能是学习率太大了,26000steps 开始,学习率砍半 + +- glm-130B-2022.06.03-07:26:16 + +## 2022-06-06 17:00 集群维护 + +集群从 9 点到 5 点升级驱动,从 开始训练 + +- glm-130B-2022.06.06-10:00:39 + 
+PS:观察到共享文件系统读取速度显著改善,现在加载 ckpt 几乎只需要 1 分钟 + +## 2022-06-08 08:00 坏点 + +- glm-130B-2022.06.08-00:00:37 + +## 2022-06-09 13:30 训练卡住 + +23100 开始恢复 + +- glm-130B-2022.06.09-05:27:54 + +## 2022-06-12 10:00 loss 爆炸 + +33700 开始 loss 炸了,loss-scale 在 33710 左右突然下跌然后 loss 在 33740 左右爆炸 + +- tensorboard 记录:glm-130B-33700 + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/C46C7CFE-1B79-491C-90FC-5A88AE90E9DF_2/7ICMyH8v6GhAgngz5bVaDKwzYjFPyk99Ax27R5w56wMz/Image.png) + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/E56BCDE0-C798-429F-81E0-1A07CCB9BC0E_2/Ig2rfKnPmLadg39Jc38UEdK90LDxlAxoH0AxmAygxzAz/Image.png) + +- 从 33600 开始加载,shrink embedding gradient 1 → 0.5 +- glm-130B-2022.06.12-02:20:49 + +## 2022-06-14 03:00 loss 爆炸 + +35250 loss 又炸了,和 33700 的表现几乎一样,都是完全没有征兆突然爆炸 + +tensorboard 记录:glm-130B-35250 + +- 从 35200 开始加载,shrink embedding gradient 0.5 → 0.1 +- glm-130B-2022.06.14-02:28:21 + +## 2022-06-19 00:10 节点故障 + +n30085 挂了,从 39600 恢复 + +- glm-130B-2022.06.18-17:49:53 + +## 2022-06-20 09:10 loss 爆炸 + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/CA344108-3B01-469C-9ABE-C41002F76484_2/oEvBST5MP0I7S4qHmQUeE7DoPCsGFSrveAOOSyitSUwz/Image.png) + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/FED0DE40-A710-4259-AE98-26BCB9568C7A_2/kH4FijsPDVJFzkbaxz7BiX0RZrul1Wrye6cE5EV8ZG0z/Image.png) + +- tensorboard 记录:glm-130B-40800 +- `--skip-train-iteration-range 40701-40900` +- 从 40700 开始重新加载并跳过 40701-40900 数据 +- glm-130B-2022.06.20-03:36:13 + +## 2022-06-22 10:40 梯度 spike + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/0B7E0A0C-4B11-4F52-BF10-E6B11A533BEF_2/yb1zC07di9zux8jbAi15gpqlstGHXZyjyMBEjO0gNKUz/Image.png) + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/A8DAC1A6-2A03-489A-8A11-BFAFFFEE3905/1C60424A-0290-4070-9327-DF9DFD135020_2/XyVoPs1yMLIuzUyrDixSYfgjc2Y2Nuor20GCz0nSPkAz/Image.png) + +- grad 有点小 spike,看起来后续恢复了,但 loss 似乎遇到了比较大的波动 +- `--skip-train-iteration-range 40701-40900` +- 从 42400 开始重新加载并跳过 42401-42600 数据 +- glm-130B-2022.06.22-02:38:20 + +## 2022-06-22 21:00 梯度 spike + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/E406CC41-4180-4108-BCCF-5E727CEB8F09/1D7D801C-3226-4CB0-978C-F19B4DA46721_2/nmg9r87OFrdErZvY9xjiDIHvgPVLv39vy8ZVtGkj2H0z/Image.png) + +![Image.png](https://res.craft.do/user/full/97ed555f-7125-cca2-fd7d-9f1a0585132e/doc/E406CC41-4180-4108-BCCF-5E727CEB8F09/5F5CA3D6-AF58-4087-9806-1529D3A2EF6C_2/WSQqyBdv1rvzvNloXE6Ssql7GxMDoULU38FAQCv3778z/Image.png) + +- grad 又有 spike,但是 loss-scale 没有一降到底,推测应该可以恢复 +- 这几天的反复 spike,我们分析可能是后期 learning rate 降低太慢,将 min-lr 从 8e-6 调整到 4e-6 +- `--min-lr 4e-6` +- 从 42700 加载开始训练 +- glm-130B-2022.06.22-13:03:53 + +## 2022.06.26 16:00 节点故障 + +- 节点 NVLink Error,重启训练 +- glm-130B-2022.06.26-13:13:51 + +## 2022.06.29 00:00 恢复 position_id + +- 48100 从原先配置开始训练 +- glm-130B-2022.06.29-13:53:21 diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/__init__.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..962c6f388ff409fb46705501042bc52fac79d54f --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/__init__.py 
@@ -0,0 +1,65 @@ +import torch + +from .layers import QuantizedColumnParallelLinear +from .layers import QuantizedRowParallelLinear +# from .ascend_layers import QuantizedColumnParallelLinear +# from .ascend_layers import QuantizedRowParallelLinear + + +def quantize(model, weight_bit_width): + """Replace fp16 linear with quantized linear""" + + if torch.distributed.get_rank() == 0: + print(f"> Quantizing model weight to {weight_bit_width} bits") + + for layer in model.transformer.layers: + layer.attention.query_key_value = QuantizedColumnParallelLinear( + weight_bit_width=weight_bit_width, + weight=layer.attention.query_key_value.weight.to(torch.cuda.current_device()), + input_size=layer.attention.query_key_value.input_size, + output_size=layer.attention.query_key_value.output_size, + bias=True, + gather_output=False, + params_dtype=torch.half, + name="query_key_value", + skip_init=True, + device=layer.attention.query_key_value.weight.device, + ) + layer.attention.dense = QuantizedRowParallelLinear( + weight_bit_width=weight_bit_width, + weight=layer.attention.dense.weight.to(torch.cuda.current_device()), + input_size=layer.attention.dense.input_size, + output_size=layer.attention.dense.output_size, + bias=True, + input_is_parallel=True, + params_dtype=torch.half, + name="dense", + skip_init=True, + device=layer.attention.dense.weight.device, + ) + layer.mlp.dense_h_to_4h = QuantizedColumnParallelLinear( + weight_bit_width=weight_bit_width, + weight=layer.mlp.dense_h_to_4h.weight.to(torch.cuda.current_device()), + input_size=layer.mlp.dense_h_to_4h.input_size, + output_size=layer.mlp.dense_h_to_4h.output_size, + bias=True, + gather_output=False, + params_dtype=torch.half, + name="dense_h_to_4h", + skip_init=True, + device=layer.mlp.dense_h_to_4h.weight.device, + ) + layer.mlp.dense_4h_to_h = QuantizedRowParallelLinear( + weight_bit_width=weight_bit_width, + weight=layer.mlp.dense_4h_to_h.weight.to(torch.cuda.current_device()), + input_size=layer.mlp.dense_4h_to_h.input_size, + output_size=layer.mlp.dense_4h_to_h.output_size, + bias=True, + input_is_parallel=True, + params_dtype=torch.half, + name="dense_h_to_4h", + skip_init=True, + device=layer.mlp.dense_4h_to_h.weight.device, + ) + + return model diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/ascend_functional.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/ascend_functional.py new file mode 100644 index 0000000000000000000000000000000000000000..3bf95f6913e0f48c321d6a8de45b34ffdc3f4a97 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/ascend_functional.py @@ -0,0 +1,26 @@ +import torch + +from .ascend_quant_tool import ascend_extract_weight_to_half + + +class W8A16Linear(torch.autograd.Function): + @staticmethod + def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width): + ctx.inp_shape = inp.size() + ctx.weight_shape = quant_w.size() + ctx.weight_bit_width = weight_bit_width + out_features = quant_w.size(0) + inp = inp.contiguous().view(-1, inp.size(-1)) + weight = ascend_extract_weight_to_half(quant_w, scale_w, weight_bit_width) + output = inp.mm(weight.t()) + ctx.save_for_backward(inp, quant_w, scale_w) + return output.view(*(ctx.inp_shape[:-1] + (out_features,))) + + @staticmethod + def backward(ctx, grad_output: torch.Tensor): + inp, quant_w, scale_w = ctx.saved_tensors + weight = ascend_extract_weight_to_half(quant_w, scale_w, weight_bit_width) + grad_output = grad_output.contiguous().view(-1, weight.size(0)) + grad_input = 
grad_output.mm(weight) + grad_weight = grad_output.t().mm(inp) + return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/ascend_layers.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/ascend_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..3b96d47634a4df454bd6f99edffc6ceee0ffb976 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/ascend_layers.py @@ -0,0 +1,87 @@ +import torch +from torch.nn.parameter import Parameter + +from SwissArmyTransformer.mpu import copy_to_model_parallel_region +from SwissArmyTransformer.mpu import gather_from_model_parallel_region +from SwissArmyTransformer.mpu import reduce_from_model_parallel_region +from SwissArmyTransformer.mpu import scatter_to_model_parallel_region +from SwissArmyTransformer.mpu import ColumnParallelLinear, RowParallelLinear + +from .ascend_functional import W8A16Linear +from .ascend_quant_tool import compress_int4_weight + + +class QuantizedColumnParallelLinear(ColumnParallelLinear): + def __init__(self, weight_bit_width: int, weight=None, *args, **kwargs): + super(QuantizedColumnParallelLinear, self).__init__(*args, **kwargs) + self.weight_bit_width = weight_bit_width + + shape = self.weight.shape + del self.weight + + if weight is None: + self.weight = torch.empty( + shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=kwargs["device"] + ) + self.weight_scale = torch.empty(shape[0], dtype=kwargs["params_dtype"], device=kwargs["device"]) + else: + self.weight_scale = (weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half() + self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8) + if weight_bit_width == 4: + self.weight = compress_int4_weight(self.weight) + + self.weight = Parameter(self.weight.to(kwargs["device"]), requires_grad=False) + self.weight_scale = Parameter(self.weight_scale.to(kwargs["device"]), requires_grad=False) + + def forward(self, input_): + # Set up backprop all-reduce. + input_parallel = copy_to_model_parallel_region(input_) + # Matrix multiply. + output_parallel = W8A16Linear.apply(input_parallel, self.weight, self.weight_scale, self.weight_bit_width) + if self.bias is not None: + output_parallel = output_parallel + self.bias + if self.gather_output: + # All-gather across the partitions. 
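+            # quantize() builds these layers with gather_output=False, so in practice the
+            # partition-local result below is returned without this all-gather.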
+ output = gather_from_model_parallel_region(output_parallel) + else: + output = output_parallel + return output + + +class QuantizedRowParallelLinear(RowParallelLinear): + def __init__(self, weight_bit_width: int, weight=None, *args, **kwargs): + super(QuantizedRowParallelLinear, self).__init__(*args, **kwargs) + self.weight_bit_width = weight_bit_width + + shape = self.weight.shape + del self.weight + + if weight is None: + self.weight = torch.empty( + shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=kwargs["device"] + ) + self.weight_scale = torch.empty(shape[0], dtype=kwargs["params_dtype"], device=kwargs["device"]) + else: + self.weight_scale = (weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half() + self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8) + if weight_bit_width == 4: + self.weight = compress_int4_weight(self.weight) + + self.weight = Parameter(self.weight.to(kwargs["device"]), requires_grad=False) + self.weight_scale = Parameter(self.weight_scale.to(kwargs["device"]), requires_grad=False) + + def forward(self, input_): + # Set up backprop all-reduce. + if self.input_is_parallel: + input_parallel = input_ + else: + input_parallel = scatter_to_model_parallel_region(input_) + # Matrix multiply. + output_parallel = W8A16Linear.apply(input_parallel, self.weight, self.weight_scale, self.weight_bit_width) + # All-reduce across all the partitions. + output_ = reduce_from_model_parallel_region(output_parallel) + if self.bias is not None: + output = output_ + self.bias + else: + output = output_ + return output diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/ascend_quant/Quantization.cpp b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/ascend_quant/Quantization.cpp new file mode 100644 index 0000000000000000000000000000000000000000..725e6ea684943246ec4c9c6896cf977b92676fb5 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/ascend_quant/Quantization.cpp @@ -0,0 +1,267 @@ +#include +#include +#include +#include +#include +#include +#include +/* + * Quantization + * methods: forward + */ +using namespace sycl; +using namespace bisheng; +using namespace at; +using namespace std; + +template using LocalAccessor = + sycl::accessor; + +template +void int8WeightExtraction_kernel(sycl::queue* Q, int8_t *weight, const sycl::half *scale_list, sycl::half *output, const int n, const int k) { + constexpr size_t size_groups = 32; + constexpr int VECTOR_LENGTH = 12288; + + auto event = Q->launch(size_groups,[weight, scale_list, output, k, n](sycl::group<1> myGroup)[[sycl::noalias]] { + auto group_id = myGroup.get_id(0); + constexpr size_t VEC_NUM = 4; + int repeat_times = n / (size_groups * VEC_NUM); + constexpr size_t dstRepeatStride = 8; + constexpr size_t srcRepeatStride = 4; + int PER_INT8_BLOCKS = k / 8; + int PER_HALF_BLOCKS = k / 4; + int TOTAL_SCALE_BLOCKS = n / 16; + bisheng::vector localVec_odd; + bisheng::vector localScale; + bisheng::vector localTypecast_odd; + int startRow = group_id * repeat_times * VEC_NUM; + int starPtr = startRow * k; + int per_round_offset = VEC_NUM * k; + + dmi::memcpy_blocks(localScale.data(), scale_list, TOTAL_SCALE_BLOCKS); + + for (int i = 0; i < repeat_times; i++) { + dmi::memcpy_blocks(localTypecast_odd.data(), weight + (starPtr + i * per_round_offset), PER_INT8_BLOCKS); + vector_view localVec_total_view(vector_view(localVec_odd.data(), VEC_NUM * k, 1, dstRepeatStride)); + vec_typecast(localVec_total_view, 
vector_view(localTypecast_odd.data(), VEC_NUM * k, 1, srcRepeatStride), VEC_NUM * k); + for (int j = 0; j < VEC_NUM; j++) { + int offset_Row = i * VEC_NUM + j; + int rows = startRow + offset_Row; + vector_view localVec_odd_view(vector_view(localVec_odd.data() + j * k, k, 1, dstRepeatStride)); + vec_mul(localVec_odd_view, localVec_odd_view, localScale.data()[rows], k); + } + dmi::memcpy_blocks(output + (starPtr + i * per_round_offset), localVec_odd.data(), PER_HALF_BLOCKS); + } + + }); + event.wait(); +} + +template +void int4WeightCompression_kernel(sycl::queue &Q, const int8_t *weight, int8_t *output, const int n, const int k) { + // n: 8192 * k: 12288 + size_t num_cores = n / 2; + + Q.submit([&](sycl::handler &cgh) { + LocalAccessor localAcc_weight(2 * k, cgh); + LocalAccessor localAcc_out(k, cgh); + cgh.parallel_for_work_group(sycl::range<1> { num_cores }, [=](sycl::group<1> grp) { + size_t groupId = grp.get_id(0); + int offset = groupId * k; + // data move + [[loop::parallel]] for (int i = 0; i < k * 2; ++i) { + localAcc_weight[i] = weight[i + offset * 2]; + } + // calculate + [[loop::parallel]] for (int i = 0; i < k; ++i) { + localAcc_out[i] = (localAcc_weight[2 * i] << 4) | (localAcc_weight[2 * i + 1] & 0b00001111); + } + + [[loop::parallel]] for (int i = 0; i < k; ++i) { + output[i + offset] = localAcc_out[i]; + } + }); + }); + Q.wait(); +} + +template +void int4WeightBitshift_kernel(sycl::queue &Q, int8_t *weight, int8_t *high_out, int8_t *low_out, const int n, const int k) { + size_t num_cores = n / 2; + Q.launch(num_cores,[weight, high_out, low_out, k](sycl::group<1> myGroup)[[sycl::noalias]] { + auto group_id = myGroup.get_id(0); + __local int8_t local_weight[2 * k]; + __local int8_t local_high[2 * k]; + __local int8_t local_low[2 * k]; + + dmi::memcpy_blocks(sycl::local_ptr(local_weight).get(), sycl::global_ptr(&weight[2 * group_id * k]).get(), k / 16); + for (int i = 0; i < 2 * k; ++i) { + int8_t original = local_weight[i]; + int8_t high = original >> 4; + int8_t low = original << 4; + low = low >> 4; + local_high[i] = high; + local_low[i] = low; + } + dmi::memcpy_blocks(sycl::global_ptr(&high_out[2 * group_id * k]).get(), sycl::local_ptr(local_high).get(), k / 16); + dmi::memcpy_blocks(sycl::global_ptr(&low_out[2 * group_id * k]).get(), sycl::local_ptr(local_low).get(), k / 16); + }); + Q.wait(); +} + +template +void int4WeightExtraction_kernel(sycl::queue &Q, int8_t *high_weight, int8_t *low_weight, const sycl::half *scale_list, sycl::half *output, const int n, const int k) { + size_t num_cores = n; + const int VECTOR_LENGTH = 16384; + //dmi::memcpy_blocks(localWeight.data(), sycl::global_ptr(&weight[offset]).get(), k / 32); + Q.launch(num_cores,[high_weight, low_weight, scale_list, output, k](sycl::group<1> myGroup)[[sycl::noalias]] { + auto group_id = myGroup.get_id(0); + size_t offset = group_id * k; + bisheng::vector localHighcast; + bisheng::vector localLowcast; + + bisheng::vector localhighMul; + bisheng::vector locallowMul; + bisheng::vector localhighFinal; + bisheng::vector locallowFinal; + __local sycl::half temp_res[2 * k]; + + dmi::memcpy_blocks(localHighcast.data(), sycl::global_ptr(&high_weight[offset]).get(), bisheng::detail::ceil_div(k, 32)); + dmi::memcpy_blocks(localLowcast.data(), sycl::global_ptr(&low_weight[offset]).get(), bisheng::detail::ceil_div(k, 32)); + bisheng::vec_typecast(localhighMul, localHighcast); + bisheng::vec_typecast(locallowMul, localLowcast); + bisheng::vec_mul(localhighFinal, localhighMul, scale_list[group_id], k); + 
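+        // The low nibbles get the same per-row scale; the scaled high/low halves are then
+        // re-interleaved into temp_res so the output matches the original column order
+        // before being copied back to global memory.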
bisheng::vec_mul(locallowFinal, locallowMul, scale_list[group_id], k); + for (int i = 0; i < k; ++i) { + temp_res[2 * i] = *(localhighFinal.data() + i); + temp_res[2 * i + 1] = *(locallowFinal.data() + i); + } + dmi::memcpy_blocks(sycl::global_ptr(&output[2 * offset]).get(), sycl::local_ptr(temp_res).get(), bisheng::detail::ceil_div(k, 8)); + }); + Q.wait(); +} + + +std::unique_ptr g_sycl_queue; +std::once_flag g_sycl_inited; + +void init_sycl_queue() { + aclrtContext acl_context; + aclrtGetCurrentContext(&acl_context); + int device_id = 0; + aclrtGetDevice(&device_id); + auto npu_stream = c10_npu::getCurrentNPUStream(device_id); + auto acl_stream = npu_stream.stream(); + + sycl::context g_sycl_context = sycl::make_context(acl_context); + std::unique_ptr tmp_queue(new sycl::queue(sycl::make_queue(acl_stream, g_sycl_context))); + g_sycl_queue = std::move(tmp_queue); + +} + +template +Tensor int8WeightExtraction_bisheng_launch(const Tensor &weight, const Tensor &scale_list, const int n, const int k) { + int8_t *weight_ptr = static_cast(weight.storage().data_ptr().get()); + const sycl::half *scale_list_ptr = static_cast(scale_list.storage().data_ptr().get()); + Tensor result = at::empty({n, k}, weight.options().dtype(at::kHalf)); + sycl::half *result_ptr = static_cast(result.storage().data_ptr().get()); + + aclrtContext acl_context; + aclrtGetCurrentContext(&acl_context); + int device_id = 0; + aclrtGetDevice(&device_id); + auto npu_stream = c10_npu::getCurrentNPUStream(device_id); + auto acl_stream = npu_stream.stream(); + + std::call_once(g_sycl_inited, init_sycl_queue); + int8WeightExtraction_kernel(g_sycl_queue.get(), weight_ptr, scale_list_ptr, result_ptr, n, k); + return result; +} + +template +Tensor int4WeightCompression_bisheng_launch(const Tensor &weight, Tensor &result, const int n, const int k) { + const int8_t *weight_ptr = static_cast(weight.storage().data_ptr().get()); + int8_t *result_ptr = static_cast(result.storage().data_ptr().get()); + + aclrtContext acl_context; + aclrtGetCurrentContext(&acl_context); + sycl::context sycl_context = sycl::make_context(acl_context); + int device_id = 0; + aclrtGetDevice(&device_id); + auto npu_stream = c10_npu::getCurrentNPUStream(device_id); + auto acl_stream = npu_stream.stream(); + sycl::queue sycl_queue = sycl::make_queue(acl_stream, sycl_context); + + const int8_t *input1 = weight_ptr; + int4WeightCompression_kernel(sycl_queue, input1, result_ptr, n, k); + + return result; +} + +template +std::vector int4WeightBitshift_bisheng_launch(const Tensor &weight, Tensor &high_out, Tensor &low_out, const int n, const int k) { + int8_t *weight_ptr = static_cast(weight.storage().data_ptr().get()); + int8_t *high_ptr = static_cast(high_out.storage().data_ptr().get()); + int8_t *low_ptr = static_cast(low_out.storage().data_ptr().get()); + + aclrtContext acl_context; + aclrtGetCurrentContext(&acl_context); + sycl::context sycl_context = sycl::make_context(acl_context); + int device_id = 0; + aclrtGetDevice(&device_id); + auto npu_stream = c10_npu::getCurrentNPUStream(device_id); + auto acl_stream = npu_stream.stream(); + sycl::queue sycl_queue = sycl::make_queue(acl_stream, sycl_context); + + int8_t *input1 = weight_ptr; + int4WeightBitshift_kernel(sycl_queue, input1, high_ptr, low_ptr, n, k); + + return {high_out, low_out}; +} + +template +Tensor int4WeightExtraction_bisheng_launch(const Tensor &high_weight, const Tensor &low_weight, const Tensor &scale_list, Tensor &result, const int n, const int k) { + int8_t *high_weight_ptr = 
static_cast(high_weight.storage().data_ptr().get()); + int8_t *low_weight_ptr = static_cast(low_weight.storage().data_ptr().get()); + const sycl::half *scale_list_ptr = static_cast(scale_list.storage().data_ptr().get()); + sycl::half *result_ptr = static_cast(result.storage().data_ptr().get()); + + aclrtContext acl_context; + aclrtGetCurrentContext(&acl_context); + sycl::context sycl_context = sycl::make_context(acl_context); + int device_id = 0; + aclrtGetDevice(&device_id); + auto npu_stream = c10_npu::getCurrentNPUStream(device_id); + auto acl_stream = npu_stream.stream(); + sycl::queue sycl_queue = sycl::make_queue(acl_stream, sycl_context); + + int8_t *input1 = high_weight_ptr; + int8_t *input2 = low_weight_ptr; + const sycl::half *input3 = scale_list_ptr; + int4WeightExtraction_kernel(sycl_queue, input1, input2, input3, result_ptr, n, k); + + return result; +} + +Tensor int8WeightExtraction_bisheng(const Tensor &weight, const Tensor &scale_list, const int n, const int k) { + return int8WeightExtraction_bisheng_launch(weight, scale_list, n, k); +} + +Tensor int4WeightCompression_bisheng(const Tensor &weight, Tensor &result, const int n, const int k) { + return int4WeightCompression_bisheng_launch(weight, result, n, k); +} + +std::vector int4WeightBitshift_bisheng(const Tensor &weight, Tensor &high_out, Tensor &low_out, const int n, const int k) { + return int4WeightBitshift_bisheng_launch(weight, high_out, low_out, n, k); +} + +Tensor int4WeightExtraction_bisheng(const Tensor &high_weight, const Tensor &low_weight, const Tensor &scale_list, Tensor &result, const int n, const int k) { + return int4WeightExtraction_bisheng_launch(high_weight, low_weight, scale_list, result, n, k); +} + +PYBIND11_MODULE(Quantization_bisheng, m) { + m.def("int8WeightExtraction", &int8WeightExtraction_bisheng, "int8 weight extraction"); + m.def("int4WeightCompression", &int4WeightCompression_bisheng, "int4 weight Compression"); + m.def("int4WeightBitshift", &int4WeightBitshift_bisheng, "int4 weight Bitshift"); + m.def("int4WeightExtraction", &int4WeightExtraction_bisheng, "int4 weight Extraction"); +} \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/ascend_quant/quantization_bisheng_setup.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/ascend_quant/quantization_bisheng_setup.py new file mode 100644 index 0000000000000000000000000000000000000000..adb8fe2662e188b59537c9436cfa9ac909709ae0 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/ascend_quant/quantization_bisheng_setup.py @@ -0,0 +1,16 @@ +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension + +from torch_npu.utils.cpp_extension import BiShengExtension + + +setup( + name='Quantization_bisheng', + ext_modules=[ + BiShengExtension( + name='Quantization_bisheng', + sources=['Quantization.cpp']), + ], + cmdclass={ + 'build_ext': BuildExtension + }) \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/ascend_quant_tool.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/ascend_quant_tool.py new file mode 100644 index 0000000000000000000000000000000000000000..5fa9460448d53930c93ef07ad5d5be74bd3a80f0 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/ascend_quant_tool.py @@ -0,0 +1,38 @@ +import torch +# 导入毕昇算子, 注意需要先编译 +import Quantization_bisheng + + +def ascend_compress_int4_weight(weight: torch.Tensor): + """ + weight:按照4bit量化后的权重,实际存储于Int8中,每个Int8存放了一个4bit精度的权重 + 
out: the 4-bit-quantized weight repacked so that each int8 holds two 4-bit values; out is half the size of weight
+    """
+    n, m = weight.size(0), weight.size(1)
+    assert m % 2 == 0
+    compression_out = torch.empty(n, m // 2, dtype=torch.int8, device="npu")
+    Quantization_bisheng.int4WeightCompression(weight, compression_out, n, m)
+    return compression_out
+
+
+def ascend_extract_weight_to_half(weight: torch.Tensor, scale_list: torch.Tensor, source_bit_width: int):
+    """
+    weight: quantized weight resident on the NPU
+    scale_list: dequantization scales
+    extract_out: the dequantized half-precision weight (return value)
+    """
+    if source_bit_width == 8:
+        n, m = weight.size(0), weight.size(1)
+        extract_out = Quantization_bisheng.int8WeightExtraction(weight, scale_list, n, m)
+        return extract_out
+
+    elif source_bit_width == 4:
+        n, m = weight.size(0), weight.size(1)
+        extract_out_high = torch.empty(n, m, dtype=torch.half, device="npu")
+        extract_out_low = torch.empty(n, m, dtype=torch.half, device="npu")
+        extract_out = torch.empty(n, m * 2, dtype=torch.half, device="npu")
+        Quantization_bisheng.int4WeightBitshift(weight, extract_out_high, extract_out_low, n, m)
+        Quantization_bisheng.int4WeightExtraction(extract_out_high, extract_out_low, scale_list, extract_out, n, m)
+        return extract_out
+    else:
+        assert False, "Unsupported bit-width"
\ No newline at end of file
diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/functional.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/functional.py
new file mode 100644
index 0000000000000000000000000000000000000000..261bfe6efcea56dc60a76c79e064675a8aab27d8
--- /dev/null
+++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/functional.py
@@ -0,0 +1,28 @@
+import torch
+
+# from kernels import extract_weight_to_half
+
+
+class W8A16Linear(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, inp: torch.Tensor, quant_w: torch.Tensor, scale_w: torch.Tensor, weight_bit_width):
+        ctx.inp_shape = inp.size()
+        ctx.weight_shape = quant_w.size()
+        ctx.weight_bit_width = weight_bit_width
+        out_features = quant_w.size(0)
+        inp = inp.contiguous().view(-1, inp.size(-1))
+        # weight = extract_weight_to_half(quant_w, scale_w, weight_bit_width)
+        weight = quant_w * scale_w[:, None]
+        output = inp.mm(weight.t())
+        ctx.save_for_backward(inp, quant_w, scale_w)
+        return output.view(*(ctx.inp_shape[:-1] + (out_features,)))
+
+    @staticmethod
+    def backward(ctx, grad_output: torch.Tensor):
+        inp, quant_w, scale_w = ctx.saved_tensors
+        # weight = extract_weight_to_half(quant_w, scale_w, ctx.weight_bit_width)
+        weight = quant_w * scale_w[:, None]
+        grad_output = grad_output.contiguous().view(-1, weight.size(0))
+        grad_input = grad_output.mm(weight)
+        grad_weight = grad_output.t().mm(inp)
+        return grad_input.view(ctx.inp_shape), grad_weight.view(ctx.weight_shape), None
diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/layers.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ab3fd2ec5f5e630f2417c11214a189d6682a098
--- /dev/null
+++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/quantization/layers.py
@@ -0,0 +1,87 @@
+import torch
+from torch.nn.parameter import Parameter
+
+from SwissArmyTransformer.mpu import copy_to_model_parallel_region
+from SwissArmyTransformer.mpu import gather_from_model_parallel_region
+from SwissArmyTransformer.mpu import reduce_from_model_parallel_region
+from SwissArmyTransformer.mpu import scatter_to_model_parallel_region
+from SwissArmyTransformer.mpu import ColumnParallelLinear, RowParallelLinear
+
+from .functional import W8A16Linear
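+# `kernels` wraps the CUDA fatbin ops shipped with this repo (via cpm_kernels); the
+# Ascend/NPU equivalents live in ascend_layers.py / ascend_quant_tool.py and can be
+# swapped in through quantization/__init__.py.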
+from kernels import compress_int4_weight + + +class QuantizedColumnParallelLinear(ColumnParallelLinear): + def __init__(self, weight_bit_width: int, weight=None, *args, **kwargs): + super(QuantizedColumnParallelLinear, self).__init__(*args, **kwargs) + self.weight_bit_width = weight_bit_width + + shape = self.weight.shape + del self.weight + + if weight is None: + self.weight = torch.empty( + shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=kwargs["device"] + ) + self.weight_scale = torch.empty(shape[0], dtype=kwargs["params_dtype"], device=kwargs["device"]) + else: + self.weight_scale = (weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half() + self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8) + if weight_bit_width == 4: + self.weight = compress_int4_weight(self.weight) + + self.weight = Parameter(self.weight.to(kwargs["device"]), requires_grad=False) + self.weight_scale = Parameter(self.weight_scale.to(kwargs["device"]), requires_grad=False) + + def forward(self, input_): + # Set up backprop all-reduce. + input_parallel = copy_to_model_parallel_region(input_) + # Matrix multiply. + output_parallel = W8A16Linear.apply(input_parallel, self.weight, self.weight_scale, self.weight_bit_width) + if self.bias is not None: + output_parallel = output_parallel + self.bias + if self.gather_output: + # All-gather across the partitions. + output = gather_from_model_parallel_region(output_parallel) + else: + output = output_parallel + return output + + +class QuantizedRowParallelLinear(RowParallelLinear): + def __init__(self, weight_bit_width: int, weight=None, *args, **kwargs): + super(QuantizedRowParallelLinear, self).__init__(*args, **kwargs) + self.weight_bit_width = weight_bit_width + + shape = self.weight.shape + del self.weight + + if weight is None: + self.weight = torch.empty( + shape[0], shape[1] * weight_bit_width // 8, dtype=torch.int8, device=kwargs["device"] + ) + self.weight_scale = torch.empty(shape[0], dtype=kwargs["params_dtype"], device=kwargs["device"]) + else: + self.weight_scale = (weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half() + self.weight = torch.round(weight / self.weight_scale[:, None]).to(torch.int8) + if weight_bit_width == 4: + self.weight = compress_int4_weight(self.weight) + + self.weight = Parameter(self.weight.to(kwargs["device"]), requires_grad=False) + self.weight_scale = Parameter(self.weight_scale.to(kwargs["device"]), requires_grad=False) + + def forward(self, input_): + # Set up backprop all-reduce. + if self.input_is_parallel: + input_parallel = input_ + else: + input_parallel = scatter_to_model_parallel_region(input_) + # Matrix multiply. + output_parallel = W8A16Linear.apply(input_parallel, self.weight, self.weight_scale, self.weight_bit_width) + # All-reduce across all the partitions. 
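+        # Each rank multiplies its weight shard against its slice of the input; the
+        # partial products are summed here so every rank holds the full output.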
+ output_ = reduce_from_model_parallel_region(output_parallel) + if self.bias is not None: + output = output_ + self.bias + else: + output = output_ + return output diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/requirements.txt b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e30f9bf142b32e6c3f63bb34faebe1df07554a3 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/requirements.txt @@ -0,0 +1,6 @@ +SwissArmyTransformer>=0.2.12 +icetk +apex +scipy +dataclass_wizard +cpm_kernels \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/03DF31017FE184DB45D41DFFC6F80EF0.png b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/03DF31017FE184DB45D41DFFC6F80EF0.png new file mode 100644 index 0000000000000000000000000000000000000000..172b0b75c5caed1433f35ef6be8a564a106486aa Binary files /dev/null and b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/03DF31017FE184DB45D41DFFC6F80EF0.png differ diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/33872E48D3539EA132B74BCF5EFF458F.png b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/33872E48D3539EA132B74BCF5EFF458F.png new file mode 100644 index 0000000000000000000000000000000000000000..03baf3ff32b586107e91f3ee5ffb14b639629a57 Binary files /dev/null and b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/33872E48D3539EA132B74BCF5EFF458F.png differ diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/7CB441707D1035B2890AA2164C5B6EAC.png b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/7CB441707D1035B2890AA2164C5B6EAC.png new file mode 100644 index 0000000000000000000000000000000000000000..0d245ed75dfb50c9656fa776924299fc02cf40a8 Binary files /dev/null and b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/7CB441707D1035B2890AA2164C5B6EAC.png differ diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/7D6433A42D189E2E6FBC62BE066BCE91.png b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/7D6433A42D189E2E6FBC62BE066BCE91.png new file mode 100644 index 0000000000000000000000000000000000000000..48f085a16bb5cf400f126d665b109d3232dcdf44 Binary files /dev/null and b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/7D6433A42D189E2E6FBC62BE066BCE91.png differ diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/849024E93FA85347F7F6443932911922.png b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/849024E93FA85347F7F6443932911922.png new file mode 100644 index 0000000000000000000000000000000000000000..d2987b80024ec80bec5a88b9d6b926e730283fea Binary files /dev/null and b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/849024E93FA85347F7F6443932911922.png differ diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/AE18F14396E2D22BC0BC8DD77EFD3414.png b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/AE18F14396E2D22BC0BC8DD77EFD3414.png new file mode 100644 index 0000000000000000000000000000000000000000..e42df0ae8206fdabc7628a8b94a60ac7fd8f97e7 Binary files /dev/null and b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/AE18F14396E2D22BC0BC8DD77EFD3414.png differ diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/E42321373D22DE198231279B5856BB42.png b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/E42321373D22DE198231279B5856BB42.png new file mode 100644 index 0000000000000000000000000000000000000000..6e92a6ace87d06d17ee02c97e36696c3149d8f8c Binary files /dev/null and 
b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/E42321373D22DE198231279B5856BB42.png differ diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/F48B69263360688CCA21E915F4B1A98B.png b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/F48B69263360688CCA21E915F4B1A98B.png new file mode 100644 index 0000000000000000000000000000000000000000..3f8e359eacef26d818f5558e744a3a265733baa8 Binary files /dev/null and b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/F48B69263360688CCA21E915F4B1A98B.png differ diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/WechatGroup.jpeg b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/WechatGroup.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..59acf80fc7250a5bea3bf46643278f9a4250575e Binary files /dev/null and b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/WechatGroup.jpeg differ diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/multitask_list.txt b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/multitask_list.txt new file mode 100644 index 0000000000000000000000000000000000000000..152d86f60d330b736df22c3222e4f4b2c3a82b96 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/resources/multitask_list.txt @@ -0,0 +1,70 @@ +super_glue/wsc.fixed +winogrande/winogrande_xl +super_glue/rte +glue/mrpc +glue/qqp +paws/labeled_final +ai2_arc/ARC_Challenge +ai2_arc/ARC_Easy +kilt_tasks/hotpot_qa +trivia_qa/unfiltered +web_questions +wiki_qa +adversarial_qa/dbidaf +adversarial_qa/dbert +adversarial_qa/droberta +duorc/SelfRC +duorc/ParaphraseRC +ropes +squad_v2 +super_glue/record +quoref +tydiqa +cos_e/v1.11 +cosmos_qa +dream +openbookqa/main +qasc +quail +quarel +quartz +race/high +race/middle +sciq +social_i_qa +super_glue/boolq +super_glue/multirc +wiki_hop/original +wiqa +piqa +amazon_polarity +app_reviews +imdb +rotten_tomatoes +yelp_review_full +super_glue/copa +hellaswag +common_gen +wiki_bio +cnndailymail/3.0.0 +gigaword +multi_news +samsum +xsum +ag_news +dbpedia_14 +trec +super_glue/wic +tacred +conll04 (joint entity relation extraction) +nyt29 (joint entity relation extraction) +ace2005 (joint entity relation extraction) +ade (joint entity relation extraction) +conll03 (named entity recognition) +ontonotes (named entity recognition) +genia (named entity recognition) +conll05 (semantic role labeling) +conll12 (semantic role labeling) +propbank (semantic role labeling) +ace05 (event extraction) +multi_woz_2.1 (dialogue state tracking) diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/scripts/benchmark.sh b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/scripts/benchmark.sh new file mode 100644 index 0000000000000000000000000000000000000000..fe554cd75eb64b94206e65f566a5cfca8a045f54 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/scripts/benchmark.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +script_path=$(realpath $0) +script_dir=$(dirname $script_path) +main_dir=$(dirname $script_dir) + +source "${main_dir}/configs/model_glm_130b.sh" + +ARGS="${main_dir}/benchmark.py \ + --mode inference \ + $MODEL_ARGS" + +TIMESTAMP=$(date +'%Y.%m.%d-%H:%M:%S') +EXP_NAME=${TIMESTAMP} + +mkdir -p logs + +run_cmd="torchrun --nproc_per_node $MP_SIZE ${ARGS}" +echo $run_cmd +eval ${run_cmd} 2>&1 | tee logs/${EXP_NAME}.log diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/scripts/evaluate.sh b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/scripts/evaluate.sh new file mode 100644 index 0000000000000000000000000000000000000000..0ad62070575efb8742b839951f4b40846929232d --- /dev/null 
+++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/scripts/evaluate.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +script_path=$(realpath $0) +script_dir=$(dirname $script_path) +main_dir=$(dirname $script_dir) + +source "${main_dir}/configs/model_glm_130b_int8.sh" +# source "${main_dir}/configs/model_glm_130b.sh" + +DATA_PATH="" + +ARGS="${main_dir}/evaluate.py \ + --mode inference \ + --data-path $DATA_PATH \ + --task $* \ + $MODEL_ARGS" + +TIMESTAMP=$(date +'%Y.%m.%d-%H:%M:%S') +EXP_NAME=${TIMESTAMP} + +mkdir -p logs + +run_cmd="torchrun --nproc_per_node $MP_SIZE ${ARGS}" +eval ${run_cmd} 2>&1 | tee logs/${EXP_NAME}.log diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/scripts/evaluate_multiple_node.sh b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/scripts/evaluate_multiple_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..8337cd77184e525b9dbfe9680b40e6cf59c32956 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/scripts/evaluate_multiple_node.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +NUM_WORKERS=16 +NUM_GPUS_PER_WORKER=8 +HOST_FILE_PATH="" +OPTIONS_NCCL="NCCL_IB_DISABLE=0 NCCL_NET_GDR_LEVEL=2 CUDA_LAUNCH_BLOCKING=0" + +script_path=$(realpath $0) +script_dir=$(dirname $script_path) +main_dir=$(dirname $script_dir) + +source "${main_dir}/configs/model_glm_130b.sh" + +DATA_PATH="" + +ARGS="${main_dir}/evaluate.py \ + --mode inference \ + --data-path $DATA_PATH \ + --task $* \ + $MODEL_ARGS" + +TIMESTAMP=$(date +'%Y.%m.%d-%H:%M:%S') +EXP_NAME=${TIMESTAMP} + +mkdir -p logs + +run_cmd="${OPTIONS_NCCL} deepspeed --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} --hostfile ${HOST_FILE_PATH} ${ARGS}" +eval ${run_cmd} 2>&1 | tee logs/${EXP_NAME}.log diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/scripts/generate.sh b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/scripts/generate.sh new file mode 100644 index 0000000000000000000000000000000000000000..b0162ee99e674e085d5a83ce2ceadc879f8f3494 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/scripts/generate.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +script_path=$(realpath $0) +script_dir=$(dirname $script_path) +main_dir=$(dirname $script_dir) + +source "${main_dir}/configs/model_glm_130b_int8.sh" +# source "${main_dir}/configs/model_glm_130b.sh" + +SEED=1234 +MAX_OUTPUT_LENGTH=256 +MIN_GEN_LENGTH=0 +# BeamSearchStrategy args +NUM_BEAMS=4 +LENGTH_PENALTY=1.0 +NO_REPEAT_NGRAM=3 +# BaseStrategy args +TEMP=1.0 +TOPK=0 +TOPP=0.7 + +ARGS="${main_dir}/generate.py \ + --seed $SEED \ + --mode inference \ + --sampling-strategy BaseStrategy \ + --out-seq-length $MAX_OUTPUT_LENGTH \ + --min-gen-length $MIN_GEN_LENGTH \ + --num-beams $NUM_BEAMS \ + --length-penalty $LENGTH_PENALTY \ + --no-repeat-ngram-size $NO_REPEAT_NGRAM \ + --temperature $TEMP \ + --top_k $TOPK \ + --top_p $TOPP \ + --output-path samples \ + $MODEL_ARGS \ + $*" + +run_cmd="torchrun --nproc_per_node $MP_SIZE ${ARGS}" +eval ${run_cmd} diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/scripts/generate_bind_kernel.sh b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/scripts/generate_bind_kernel.sh new file mode 100644 index 0000000000000000000000000000000000000000..aea2526570ea513ccb09e85c4d554ae3781edbc8 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/scripts/generate_bind_kernel.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +script_path=$(realpath $0) +script_dir=$(dirname $script_path) +main_dir=$(dirname $script_dir) + +source "${main_dir}/configs/model_glm_130b_int8.sh" +# source "${main_dir}/configs/model_glm_130b.sh" + +SEED=1234 
+MAX_OUTPUT_LENGTH=256 +MIN_GEN_LENGTH=0 +# BeamSearchStrategy args +NUM_BEAMS=4 +LENGTH_PENALTY=1.0 +NO_REPEAT_NGRAM=3 +# BaseStrategy args +TEMP=1.0 +TOPK=0 +TOPP=0.7 + +# need to prepare input.txt for input-source +ARGS="${main_dir}/generate.py \ + --seed $SEED \ + --mode inference \ + --sampling-strategy BaseStrategy \ + --out-seq-length $MAX_OUTPUT_LENGTH \ + --min-gen-length $MIN_GEN_LENGTH \ + --num-beams $NUM_BEAMS \ + --length-penalty $LENGTH_PENALTY \ + --no-repeat-ngram-size $NO_REPEAT_NGRAM \ + --temperature $TEMP \ + --top_k $TOPK \ + --top_p $TOPP \ + --output-path samples \ + --input-source ./input.txt \ + $MODEL_ARGS \ + $*" + +# task start with binding kernel +WORLD_SIZE=8 + +KERNEL_NUM=$(($(nproc)/$WORLD_SIZE)) +for((RANK_ID=0;RANK_ID<${WORLD_SIZE};RANK_ID++)); +do + export WORLD_SIZE=$WORLD_SIZE + export RANK_ID=$RANK_ID + export RANK=$RANK_ID + export LOCAL_RANK=$RANK_ID + if [ $(uname -m) = 'aarch64' ] + then + PID_START=$((KERNEL_NUM * RANK_ID)) + PID_END=$((PID_START + KERNEL_NUM - 1)) + nohup taskset -c $PID_START-$PID_END python ${ARGS} > logs/device_${RANK_ID}.log 2>&1 & + else + echo not aarch64, bind kernel take no effect. + nohup python ${ARGS} > logs/device_${RANK_ID}.log 2>&1 & + fi + echo RANK ${RANK_ID} task started. log is stored in ./logs/device_${RANK_ID}.log +done diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/glue_cola.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/glue_cola.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1402630e79556f081d900280a8754f9e08e79b97 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/glue_cola.yaml @@ -0,0 +1,6 @@ +name: 'glue_cola' +type: 'mul' +path: 'bloom/glue_cola' +file-pattern: + validation: "**/validation.jsonl" +micro-batch-size: 30 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/glue_mnli.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/glue_mnli.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6cc8e7aab39c158aeedffacbad6016e3b58977aa --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/glue_mnli.yaml @@ -0,0 +1,7 @@ +name: 'glue_mnli' +type: 'mul' +path: 'bloom/glue_mnli' +file-pattern: + validation-matched: "**/validation_matched.jsonl" + validation-mismatched: "**/validation_mismatched.jsonl" +micro_batch_size: 8 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/glue_qnli.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/glue_qnli.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8e335d4ddc95806b96e9b96dc8778fed3e35f928 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/glue_qnli.yaml @@ -0,0 +1,6 @@ +name: 'glue_qnli' +type: 'mul' +path: 'bloom/glue_qnli' +file-pattern: + validation: "**/validation.jsonl" +micro_batch_size: 6 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/glue_wnli.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/glue_wnli.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5f768f9d5a3bbd65ac1762b4a033815422d3d8a1 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/glue_wnli.yaml @@ -0,0 +1,6 @@ +name: 'glue_wnli' +type: 'mul' +path: 'bloom/glue_wnli' +file-pattern: + validation: "**/validation.jsonl" +micro_batch_size: 16 \ No newline at end of file diff --git 
a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/math_qa.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/math_qa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4fb6f56f77600323619102e59499efa95a23183a --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/math_qa.yaml @@ -0,0 +1,7 @@ +name: 'math_qa' +type: 'mul' +path: 'bloom/math_qa' +file-pattern: + validation: "**/validation.jsonl" + test: "**/test.jsonl" +micro_batch_size: 6 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/mc_taco.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/mc_taco.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0156d561a49556cc53b858517cfebd7093d996c7 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/mc_taco.yaml @@ -0,0 +1,6 @@ +name: 'mc_taco' +type: 'gen' +path: 'bloom/mc_taco' +file-pattern: + validation: "**/validation_pp.jsonl" + test: "**/test_pp.jsonl" \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/openbook_qa.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/openbook_qa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..63cc523044e5d7eb1acddb91923e6763e45130d7 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/openbook_qa.yaml @@ -0,0 +1,7 @@ +name: 'openbook_qa' +type: 'mul' +path: 'bloom/openbookqa_main' +file-pattern: + test: "**/test.jsonl" + validation: "**/validation.jsonl" +micro_batch_size: 18 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/pubmed_qa.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/pubmed_qa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3fac16be999a86767395f4bc057807f15c4ea947 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/pubmed_qa.yaml @@ -0,0 +1,6 @@ +name: 'pubmed_qa' +type: 'mul' +path: 'bloom/pubmed_qa_pqa_labeled' +file-pattern: + train: "**/train.jsonl" +micro_batch_size: 2 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/superglue_axb.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/superglue_axb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ed031a449953caa3b427c6f1d5de25c86c7123dd --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/superglue_axb.yaml @@ -0,0 +1,6 @@ +name: 'superglue_axb' +type: 'mul' +path: 'bloom/super_glue_axb' +file-pattern: + test: "**/test.jsonl" +micro_batch_size: 16 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/superglue_axg.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/superglue_axg.yaml new file mode 100644 index 0000000000000000000000000000000000000000..99ad0e9068672ec210c536c63915607ccb74407e --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/bloom/superglue_axg.yaml @@ -0,0 +1,6 @@ +name: 'superglue_axg' +type: 'mul' +path: 'bloom/super_glue_axg' +file-pattern: + test: "**/test.jsonl" +micro_batch_size: 34 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/afqmc.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/afqmc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7687bd0bd4d36d8c6e778587231614291b0b2646 --- /dev/null +++ 
b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/afqmc.yaml @@ -0,0 +1,4 @@ +name: 'AFQMC' +type: 'mul' +path: 'CLUE/afqmc' +micro_batch_size: 16 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/c3.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/c3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..36b44968b25f143a2802ce486e63411ecc50fedd --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/c3.yaml @@ -0,0 +1,4 @@ +name: 'C3' +type: 'mul' +path: 'CLUE/c3' +micro_batch_size: 2 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/cluewsc.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/cluewsc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..33e6c50c63f18dd8cb99391eaf368fb570313fd7 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/cluewsc.yaml @@ -0,0 +1,4 @@ +name: 'CLUEWSC2020' +type: 'mul' +path: 'CLUE/cluewsc' +micro_batch_size: 18 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/cmnli.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/cmnli.yaml new file mode 100644 index 0000000000000000000000000000000000000000..087ef6f28d17a556b51d69ca7044574d7536dcc9 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/cmnli.yaml @@ -0,0 +1,4 @@ +name: 'CMNLI' +type: 'mul' +path: 'CLUE/cmnli' +micro_batch_size: 16 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/cmrc2018.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/cmrc2018.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3eef828d8f47e6bb6229086616b2a2586cd85a7d --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/cmrc2018.yaml @@ -0,0 +1,3 @@ +name: "CMRC2018" +type: "gen" +path: "CLUE/cmrc2018" diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/csl.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/csl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..12bd89d32261113ce3d61a934d97c75e3a787c3c --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/csl.yaml @@ -0,0 +1,4 @@ +name: 'CSL' +type: 'mul' +path: 'CLUE/csl' +micro_batch_size: 3 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/drcd.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/drcd.yaml new file mode 100644 index 0000000000000000000000000000000000000000..88ead2c7dcadef7e48ae92c665241c6cd1eb7daf --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/drcd.yaml @@ -0,0 +1,3 @@ +name: "DRCD" +type: "gen" +path: "CLUE/drcd" diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/ocnli.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/ocnli.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ed13dfeec652d6f07ac1d2f7898b0909c54bd5cb --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/clue/ocnli.yaml @@ -0,0 +1,4 @@ +name: 'OCNLI_50K' +type: 'mul' +path: 'CLUE/ocnli' +micro_batch_size: 24 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/fewclue/bustm.yaml 
b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/fewclue/bustm.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a9a22be5be87d995c9fd4d53e28f05e8dbcb6e9b --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/fewclue/bustm.yaml @@ -0,0 +1,7 @@ +name: 'BUSTM' +type: 'mul' +path: 'CLUE/bustm' +file-pattern: + dev: "**/dev_few_all.jsonl" + test: "**/test_public.jsonl" +micro_batch_size: 56 diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/fewclue/chidf.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/fewclue/chidf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..13a84c39f25388fbdc1504e093b6841a7e84db7f --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/fewclue/chidf.yaml @@ -0,0 +1,7 @@ +name: 'CHIDF' +type: 'mul' +path: 'CLUE/chid-fc' +file-pattern: + dev: "**/dev_few_all.jsonl" + test: "**/test_public.jsonl" +micro_batch_size: 16 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/fewclue/cluewscf.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/fewclue/cluewscf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d449f22557d7e17d9922ea085f39a7b9c8d13ce3 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/fewclue/cluewscf.yaml @@ -0,0 +1,7 @@ +name: 'CLUEWSCF' +type: 'mul' +path: 'CLUE/cluewsc-fc' +file-pattern: + dev: "**/dev_few_all.jsonl" + test: "**/test_public.jsonl" +micro_batch_size: 16 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/fewclue/cslf.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/fewclue/cslf.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c749a5032f7f7a57060c1348312c6a33837ab3e8 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/fewclue/cslf.yaml @@ -0,0 +1,7 @@ +name: 'CSLF' +type: 'mul' +path: 'CLUE/csl-fc' +file-pattern: + dev: "**/dev_few_all.jsonl" + test: "**/test_public.jsonl" +micro_batch_size: 2 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/fewclue/eprstmt.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/fewclue/eprstmt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..81141fa8974672aa01e697a0b9752aa9b42fe828 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/fewclue/eprstmt.yaml @@ -0,0 +1,7 @@ +name: 'EPRSTMT' +type: 'mul' +path: 'CLUE/eprstmt-fc' +file-pattern: + dev: "**/dev_few_all.jsonl" + test: "**/test_public.jsonl" +micro_batch_size: 6 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/fewclue/ocnlif.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/fewclue/ocnlif.yaml new file mode 100644 index 0000000000000000000000000000000000000000..38b50951f45d5c533ba254849605f1e5fddea6da --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/chinese/fewclue/ocnlif.yaml @@ -0,0 +1,7 @@ +name: 'OCNLIF' +type: 'mul' +path: 'CLUE/ocnli-fc' +file-pattern: + dev: "**/dev_few_all.jsonl" + test: "**/test_public.jsonl" +micro_batch_size: 24 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/crows-pair/crows-pair.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/crows-pair/crows-pair.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..e6a33c7ec66596c1c6c63f0249cd8dcec8e21bb8 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/crows-pair/crows-pair.yaml @@ -0,0 +1,8 @@ +name: "CROWS" +type: "mul" +path: "data" +module: "tasks.ethnic.crows-pair.tasks.CrowsPairTask" +file-pattern: + test: "**/crows-pair-dataset.jsonl" + +micro-batch-size: 1 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/crows-pair/tasks.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/crows-pair/tasks.py new file mode 100644 index 0000000000000000000000000000000000000000..d9149b2d894ed74d91b2b4350ae16692558b50e8 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/crows-pair/tasks.py @@ -0,0 +1,114 @@ +from os.path import join +from typing import Dict, Tuple, List +from abc import ABC +from collections import defaultdict +from evaluation import ( + MultiChoiceTask, + MultiChoiceTaskConfig, +) +from evaluation.dataset import ( + MultiChoiceTaskDataset, +) +from evaluation.utils import ( + print_rank_0, + get_tokenized_input, +) + + +class CrowsPairTask(MultiChoiceTask, ABC): + config: MultiChoiceTaskConfig + + def build_dataset(self, relative_path): + return CrowsPairDataset(join(self.config.path, relative_path), self.config) + + def predict_single_batch(self, batch) -> List[int]: + log_probs = self.model.cond_log_prob(batch) + return log_probs + + def CrowsPairMetric(self, predictions, examples): + print_rank_0("Special metric for CrowsPair") + results = defaultdict(float) + labels = defaultdict() + for prediction, example in zip(predictions, examples): + prediction = prediction[0] + if example["sent_ID"] == 1: + results[example["pair_ID"]] = results[example["pair_ID"]] + prediction + else: + results[example["pair_ID"]] = results[example["pair_ID"]] - prediction + labels[example["pair_ID"]] = example["bias_type"] + cat_postivie = defaultdict(int) + cat_tt = defaultdict(int) + final = defaultdict(int) + for val1, val2 in zip(results.values(), labels.values()): + if val1 >= 0: + cat_postivie[val2] = cat_postivie[val2] + 1 + else: + cat_postivie[val2] = cat_postivie[val2] + cat_tt[val2] = cat_tt[val2] + 1 + for key, val in cat_postivie.items(): + final[key] = val / cat_tt[key] + return final + + def report_single_metrics(self, file: str, result_dict: Dict[str, float]): + pass + + @property + def metrics(self): + return {"CP": self.CrowsPairMetric} + + def report_group_metrics(self, group_name, result_dict_group: Dict[str, Tuple[Dict[str, float], int]], level=1): + for result in result_dict_group.values(): + result = result[0] + for value1 in result.items(): + value1 = value1[1] + for key, value in value1.items(): + print_rank_0("category:{cat} score:{score}".format(cat=key, score=round(value * 100,2))) + + +class CrowsPairDataset(MultiChoiceTaskDataset): + + config: MultiChoiceTaskConfig + + def __init__(self, path, config: MultiChoiceTaskConfig): + self.is_single_token = True # set to False later in process_single_item func + self.eval_data = [] + super().__init__(path, config) + + def process_single_item(self, item): + text, choices, label = ( + get_tokenized_input(item, "inputs"), + get_tokenized_input(item, "choices"), + item["label"], + ) + pair_ID, sent_ID, bias_type = ( + item["pair_ID"], + item["sent_ID"], + item["bias_type"], + ) + tgt_seq_length = sum([len(choice) for choice in choices]) + if tgt_seq_length == len(choices): + # For single token, we only insert one [sop] + tgt_seq_length = 1 + + assert 
tgt_seq_length < self.config.max_seq_length + if len(text) + tgt_seq_length + 2 > self.config.max_seq_length: + text_length = self.config.max_seq_length - tgt_seq_length - 2 + text = text[len(text) - text_length : len(text)] + + assert not ( + self.mask_id in text and self.config.use_multitask_encoding + ), "Unified multitask encoding don't support blank filling" + + if tgt_seq_length != 1: + self.is_single_token = False + + dataset = { + "text": text, + "choices": choices, + "label": label, + "pair_ID": pair_ID, + "sent_ID": sent_ID, + "bias_type": bias_type, + } + + return dataset diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/ethos/ethos-fewshot-multi.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/ethos/ethos-fewshot-multi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c1e080dd9b2155ba7736ac85ee0efc1c2d59893d --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/ethos/ethos-fewshot-multi.yaml @@ -0,0 +1,7 @@ +name: "ETHOS_fewshot_multi" +type: "mul" +path: "data" +file-pattern: + test: "**/ethos-few-shot-multi.jsonl" + +micro-batch-size: 32 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/ethos/ethos-fewshot-single.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/ethos/ethos-fewshot-single.yaml new file mode 100644 index 0000000000000000000000000000000000000000..100fb2e78589f2182aaee615005cc89fa72f3daf --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/ethos/ethos-fewshot-single.yaml @@ -0,0 +1,7 @@ +name: "ETHOS_fewshot_single" +type: "mul" +path: "data" +file-pattern: + test: "**/ethos-few-shot-single.jsonl" + +micro-batch-size: 32 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/ethos/ethos-oneshot.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/ethos/ethos-oneshot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b05b7bec41ce8b90e7a314fbe8fc2cbcecf7e46f --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/ethos/ethos-oneshot.yaml @@ -0,0 +1,7 @@ +name: "ETHOS_oneshot" +type: "mul" +path: "data" +file-pattern: + test: "**/ethos-one-shot.jsonl" + +micro-batch-size: 64 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/ethos/ethos-zeroshot.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/ethos/ethos-zeroshot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9e4567930828e40128dd2de6482779fe050fd732 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/ethos/ethos-zeroshot.yaml @@ -0,0 +1,7 @@ +name: "ETHOS_zeroshot" +type: "mul" +path: "data" +file-pattern: + test: "**/ethos-zero-shot.jsonl" + +micro-batch-size: 128 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/stereoset/stereoset.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/stereoset/stereoset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..afea2458c61f205fd3f014321cc0e0b84330c9a1 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/stereoset/stereoset.yaml @@ -0,0 +1,9 @@ +name: "StereoSet" +type: "mul" +path: "data" +module: "tasks.ethnic.stereoset.tasks.StereoSetTask" +use_task_mask: True +file-pattern: + test: "**/stereoset-dataset.jsonl" + +micro-batch-size: 64 diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/stereoset/tasks.py 
b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/stereoset/tasks.py new file mode 100644 index 0000000000000000000000000000000000000000..9f8aa381c99f9e6d1744696876ccabcd60caab3e --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/ethnic/stereoset/tasks.py @@ -0,0 +1,126 @@ +from os.path import join +from collections import defaultdict +from abc import ABC +import numpy as np +from typing import Dict, Tuple, List +from evaluation import ( + MultiChoiceTask, + MultiChoiceTaskConfig, +) +from evaluation.dataset import ( + MultiChoiceTaskDataset, +) +from evaluation.utils import ( + print_rank_0, + get_tokenized_input, +) + + +class StereoSetTask(MultiChoiceTask, ABC): + config: MultiChoiceTaskConfig + + def build_dataset(self, relative_path): + return StereoSetDataset(join(self.config.path, relative_path), self.config) + + def predict_single_batch(self, batch) -> List[int]: + log_probs = self.model.cond_log_prob(batch) + normalize_log_probs = [] + for origin_datas, predicts in zip(batch.get("choices"), log_probs): + normalize_log_probs_single = [] + for origin_data, predict in zip(origin_datas, predicts): + normalize_log_probs_single.append(predict / len(origin_data)) + normalize_log_probs.append(normalize_log_probs_single) + return [np.argmax(log_probs_single).item() for log_probs_single in normalize_log_probs] + + def report_group_metrics(self, group_name, result_dict_group: Dict[str, Tuple[Dict[str, float], int]], level=1): + for tmp1 in result_dict_group.values(): + tmp1 = tmp1[0] + for result in tmp1.values(): + print("LMS") + for key, val in result[0].items(): + print_rank_0("cat:{key} score:{score}".format(key=key, score=round(val, 2))) + print("SS") + for key, val in result[1].items(): + print_rank_0("cat:{key} score:{score}".format(key=key, score=round(val, 2))) + print("ICAT") + for key, val in result[2].items(): + print_rank_0("cat:{key} score:{score}".format(key=key, score=round(val, 2))) + + def StereoSetMetric(self, predictions, examples): + print_rank_0("Special metric for StereoSet") + assert len(predictions) == len(examples) + results = defaultdict(list) + for prediction, example in zip(predictions, examples): + # print(prediction,example["goal_label"],example["goal_label"][prediction]) + predict_label = example["goal_label"][prediction] + example_type = example["bias_type"] + results[example_type].append(predict_label) + results["overall"].append(predict_label) + ss = {} + lms = {} + icat = {} + for key, values in results.items(): + label_dict = {"unrelated": 0, "stereotype": 0, "anti-stereotype": 0} + for label_keys in values: + label_dict[label_keys] = label_dict.get(label_keys, 0) + 1 + tt = len(values) + lms[key] = (label_dict["anti-stereotype"] + label_dict["stereotype"]) / tt * 100 + if label_dict["stereotype"] + label_dict["anti-stereotype"] == 0: + ss[key] = 0 + else: + ss[key] = label_dict["stereotype"] / (label_dict["anti-stereotype"] + label_dict["stereotype"]) * 100 + + icat[key] = lms[key] * (min(ss[key], 100.0 - ss[key]) / 50.0) + return [lms, ss, icat] + + def report_single_metrics(self, file: str, result_dict: Dict[str, float]): + pass + + @property + def metrics(self): + return {"SS_ICAT": self.StereoSetMetric} + + +class StereoSetDataset(MultiChoiceTaskDataset): + config: MultiChoiceTaskConfig + + def __init__(self, path, config: MultiChoiceTaskConfig): + self.is_single_token = True # set to False later in process_single_item func + self.eval_data = [] + super().__init__(path, config) + + def process_single_item(self, item): + text, 
choices, label = ( + get_tokenized_input(item, "inputs"), + get_tokenized_input(item, "choices"), + item["label"], + ) + # "ID":example.ID,"bias_type":example.bias_type,"goal_label":goal_label + ID, bias_type, goal_label = item["ID"], item["bias_type"], item["goal_label"] + tgt_seq_length = sum([len(choice) for choice in choices]) + if tgt_seq_length == len(choices): + # For single token, we only insert one [sop] + tgt_seq_length = 1 + + assert tgt_seq_length < self.config.max_seq_length + if len(text) + tgt_seq_length + 2 > self.config.max_seq_length: + text_length = self.config.max_seq_length - tgt_seq_length - 2 + text = text[len(text) - text_length : len(text)] + + assert not ( + self.mask_id in text and self.config.use_multitask_encoding + ), "Unified multitask encoding don't support blank filling" + + if tgt_seq_length != 1: + self.is_single_token = False + + dataset = { + "text": text, + "choices": choices, + "label": label, + "ID": ID, + "bias_type": bias_type, + "goal_label": goal_label, + } + + return dataset diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/lambada/lambada-unidirectional.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/lambada/lambada-unidirectional.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f725a476bf17e54bb40e01614fc96333ab0d2eee --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/lambada/lambada-unidirectional.yaml @@ -0,0 +1,13 @@ +name: "LAMBADA-unidirectional" +type: "gen" +module: "tasks.lambada.task.LAMBADA" +path: "lambada/lambada" +file-pattern: + test: "**/test.jsonl" + validation: "**/validation.jsonl" + +sampling_strategy: "BeamSearchStrategy" +num_beams: 16 +max_gen_length: 5 +use_task_mask: true +unidirectional: true diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/lambada/lambada.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/lambada/lambada.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2fb12f3d683d865b5dd44c9a0e16418a90525e96 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/lambada/lambada.yaml @@ -0,0 +1,12 @@ +name: "LAMBADA" +type: "gen" +module: "tasks.lambada.task.LAMBADA" +path: "lambada/lambada" +file-pattern: + test: "**/test.jsonl" + validation: "**/validation.jsonl" + +sampling_strategy: "BeamSearchStrategy" +num_beams: 16 +max_gen_length: 5 +use_task_mask: true diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/lambada/strategy.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/lambada/strategy.py new file mode 100644 index 0000000000000000000000000000000000000000..568fe3ac5cb5eba2b0b456ef31f85d937faffb54 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/lambada/strategy.py @@ -0,0 +1,21 @@ +from generation import BeamSearchStrategy + + +class BeamSearchStrategyForLAMBADA(BeamSearchStrategy): + def __init__(self, *args, banned_prefix=[], **kwargs): + super().__init__(*args, **kwargs) + self.banned_prefix = banned_prefix + + def forward(self, logits, tokens, mems): + batch_size, num_beams, vocab_size = logits.shape + logits = logits.float() + for prefix in self.banned_prefix: + if self.length_generated == len(prefix) - 1: + if len(prefix) == 1: + logits[..., prefix[0]] = -65504 + else: + for i in range(batch_size): + for j in range(num_beams): + if tokens[i, j, -(len(prefix) - 1) :].tolist() == prefix[:-1]: + logits[i, j, prefix[-1]] = -65504 + return super().forward(logits, tokens, mems) diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/lambada/task.py 
b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/lambada/task.py new file mode 100644 index 0000000000000000000000000000000000000000..5c2ba0102919eae95402370dd40baf2f65c13edc --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/lambada/task.py @@ -0,0 +1,61 @@ +from string import punctuation +from functools import partial +from typing import List + +from evaluation import qa_evaluate, GenerationTask + +from .strategy import BeamSearchStrategyForLAMBADA + + +def exact_match_score(prediction, ground_truth): + return prediction.strip() == ground_truth.strip() + + +class LAMBADA(GenerationTask): + @property + def metrics(self): + return {"Accuracy": partial(qa_evaluate, metric=exact_match_score)} + + def __init__(self, model, tokenizer, config_path): + super(LAMBADA, self).__init__(model, tokenizer, config_path) + + if self.config.sampling_strategy == "BeamSearchStrategy": + banned_prefix = [[46010], [146337]] # "'" and "``" + invalid_slices = [20068, 146010, 146337] + for p in punctuation: + pp = tokenizer.tokenize(p) + if len(pp) == 1: + invalid_slices.append(pp[0]) + banned_prefix.append(pp) + self.strategy = BeamSearchStrategyForLAMBADA( + batch_size=self.config.micro_batch_size, + num_beams=self.config.num_beams, + length_penalty=self.config.length_penalty, + consider_end=True, + end_tokens=self.strategy.end_tokens, + invalid_slices=invalid_slices, + banned_prefix=banned_prefix, + no_repeat_ngram_size=self.config.no_repeat_ngram_size, + min_gen_length=self.config.min_gen_length, + deterministic=True, + ) + + def get_first_word_tokens(self, tokens): + text = self.tokenizer.tokenizer.decode(tokens).strip() + return self.tokenizer.tokenize(text.split(" ")[0]) + + def predict_single_batch(self, batch): + outputs_batch: List[List[List[int]]] = self.model.generate_text(batch, self.strategy, return_all_beams=True) + predictions = [] + for outputs in outputs_batch: + found = False + for output in outputs: + text = self.tokenizer.tokenizer.decode(output).strip() + spl = text.split(" ") + if len(spl) >= 2 and spl[1] in punctuation: + predictions.append(self.get_first_word_tokens(output)) + found = True + break + if not found: + predictions.append(self.get_first_word_tokens(outputs[0])) + return predictions diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/language-modeling/pile.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/language-modeling/pile.py new file mode 100644 index 0000000000000000000000000000000000000000..1a58741b231d6d7107e2106e63a69f9d87ed33eb --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/language-modeling/pile.py @@ -0,0 +1,83 @@ +import os +import math +import json + +from typing import * +from os.path import join +from bisect import bisect_right +from itertools import accumulate +from collections import defaultdict + +from evaluation import LanguageModelTask, LanguageModelTaskDataset, print_rank_0 + + +def calculate_bpb_score(loss: List[float], data: List[Dict]): + loss_per_category = defaultdict(lambda: 0.0) + utf8_length_per_category = defaultdict(lambda: 0.0) + weights = [] + for item in data: + weights.append(item["num_sequences"]) + utf8_length_per_category[item["meta"]["pile_set_name"]] += item["utf8_length"] + weights = list(accumulate(weights)) + for idx in range(len(loss)): + document_idx = bisect_right(weights, idx) + loss_per_category[data[document_idx]["meta"]["pile_set_name"]] += loss[idx] + return { + name: (loss_per_category[name] / utf8_length_per_category[name] / math.log(2)) for name in loss_per_category + } + 
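calculate_bpb_score above converts summed token negative log-likelihoods (in nats) into bits per byte for each Pile subset: every per-sequence loss is mapped back to its source document through the cumulative num_sequences weights, and each subset's total loss is then divided by its UTF-8 byte count and by ln 2. A toy check with made-up numbers (one document split into two sequences):

import math

loss = [693.1, 693.1]  # summed NLL in nats for the document's two sequences (illustrative values)
data = [{"num_sequences": 2, "utf8_length": 1000, "meta": {"pile_set_name": "Wikipedia (en)"}}]
# calculate_bpb_score(loss, data) reduces to:
print({"Wikipedia (en)": (693.1 + 693.1) / 1000 / math.log(2)})  # ~2.0 bits per byte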
+ +class Pile(LanguageModelTask): + @property + def metrics(self) -> Dict[str, Callable]: + return {"BPB": calculate_bpb_score} + + def build_dataset(self, relative_path): + return PileDataset(join(self.config.path, relative_path), self.config) + + def report_single_metrics(self, file: str, result_dict: Dict[str, float]): + pass + + def report_group_metrics( + self, group_name, result_dict_group: Dict[str, Tuple[Dict[str, Dict[str, float]], int]], level=1 + ): + output_str = f" Finish group {group_name}:\n" + result = list(result_dict_group.values())[0][0]["BPB"] + for key, value in result.items(): + output_str += f" {key} = {value:.3f}\n" + print_rank_0(output_str) + pass + + def report_overall_metrics(self, result_dict_all: Dict[str, Tuple[Dict[str, float], int]]): + pass + + +class PileDataset(LanguageModelTaskDataset): + def __len__(self): + return self.weights[-1] + + def process_single_file(self, path): + num_sequences = [] + with open(os.path.join(path), "r", encoding="utf-8") as file: + for line in file: + item = json.loads(line) + if len(item["text"]) == 0: + continue + self.data.append( + { + "raw_text": item["text"], + "utf8_length": len(item["text_pretokenized"].encode("utf-8")), + "num_sequences": max( + math.ceil( + max(len(item["text"]) - (self.config.max_seq_length - 1), 0) + / self.config.generation_length + ) + + 1, + 1, + ), + "meta": item["meta"], + } + ) + num_sequences.append(self.data[-1]["num_sequences"]) + self.weights = list(accumulate(num_sequences)) + self.left_weights = [0] + self.weights[:-1] diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/language-modeling/pile.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/language-modeling/pile.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8fb53da2228b6d3d22236fb0f8ebc1d662db6341 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/language-modeling/pile.yaml @@ -0,0 +1,10 @@ +name: "Pile" +type: "lm" +module: "tasks.language-modeling.pile.Pile" +path: "pile" +file-pattern: + test: "**/test_tokenized.jsonl" +# validation: "**/val_tokenized.jsonl" + +generation-length: 1024 +use_task_mask: true diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/language-modeling/ptb.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/language-modeling/ptb.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f1c651370411f6c564419ba1f0523b192a8a5a67 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/language-modeling/ptb.yaml @@ -0,0 +1,8 @@ +name: "Penn Treebank" +type: "lm" +path: "ptbdataset" +file-pattern: + test: "**/ptb.test.txt" + +generation-length: 256 +use_task_mask: true diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/language-modeling/wikitext-103.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/language-modeling/wikitext-103.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1483cc50d83a05cf0b7efc308fbf9c5637ca684c --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/language-modeling/wikitext-103.yaml @@ -0,0 +1,8 @@ +name: "WikiText-103" +type: "lm" +path: "wikitext-103" +file-pattern: + test: "**/wiki.test.tokens" + +generation-length: 256 +use_task_mask: true diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/language-modeling/wikitext-2.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/language-modeling/wikitext-2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a3894a835da5b5de2b2f1220d85590e821ea8956 --- /dev/null 
+++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/language-modeling/wikitext-2.yaml @@ -0,0 +1,8 @@ +name: "WikiText-2" +type: "lm" +path: "wikitext-2" +file-pattern: + test: "**/wiki.test.tokens" + +generation-length: 256 +use_task_mask: true diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/mmlu/mmlu.yaml b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/mmlu/mmlu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bc6f0ff984a59a9923c2b5e1fb00e470bab4f370 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/mmlu/mmlu.yaml @@ -0,0 +1,10 @@ +name: "MMLU" +type: "mul" +module: "tasks.mmlu.task.MMLU" +path: "MMLU" +file-pattern: + stem: "stem/*.json" + social_sciences: "social_sciences/*.json" + humanities: "humanities/*.json" + other: "other/*.json" +micro-batch-size: 1 \ No newline at end of file diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/mmlu/task.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/mmlu/task.py new file mode 100644 index 0000000000000000000000000000000000000000..9c6117331793cc391da45c0806dda9f2b317e39c --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tasks/mmlu/task.py @@ -0,0 +1,78 @@ +import numpy as np + +from typing import Dict, Tuple + +from evaluation import MultiChoiceTask + +categories = { + "STEM": [ + "Abstract Algebra", + "Anatomy", + "Astronomy", + "College Biology", + "College Chemistry", + "College Computer Science", + "College Mathematics", + "College Physics", + "Computer Security", + "Conceptual Physics", + "Electrical Engineering", + "Elementary Mathematics", + "High School Biology", + "High School Chemistry", + "High School Computer Science", + "High School Mathematics", + "High School Physics", + "High School Statistics", + "Machine Learning", + ], + "Other": [ + "Business Ethics", + "Clinical Knowledge", + "College Medicine", + "Global Facts", + "Human Aging", + "Management", + "Marketing", + "Medical Genetics", + "Miscellaneous", + "Nutrition", + "Professional Accounting", + "Professional Medicine", + "Virology", + ], + "Social Sciences": [ + "Econometrics", + "High School Geography", + "High School Government and Politics", + "High School Macroeconomics", + "High School Microeconomics", + "High School Psychology", + "Human Sexuality", + "Professional Psychology", + "Public Relations", + "Security Studies", + "Sociology", + "US Foreign Policy", + ], + "Humanities": [ + "Formal Logic", + "High School European History", + "High School US History", + "High School World History", + "International Law", + "Jurisprudence", + "Logical Fallacies", + "Moral Disputes", + "Moral Scenarios", + "Philosophy", + "Prehistory", + "Professional Law", + "World Religions", + ], +} + + +class MMLU(MultiChoiceTask): + def report_overall_metrics(self, result_dict_all: Dict[str, Tuple[Dict[str, float], int]]): + self.report_group_metrics("Overall", result_dict_all, level=0) diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tools/__init__.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tools/convert_tp.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tools/convert_tp.py new file mode 100644 index 0000000000000000000000000000000000000000..aa7962d8876420f3134e4e094be167b6b6d4e3d0 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tools/convert_tp.py @@ -0,0 +1,154 @@ +import os +import sys +import 
torch +import argparse +import glob + +from typing import * + +sys.path.append(".") + +SEQUENTIAL_LAYERS = [ + "input_layernorm.weight", + "input_layernorm.bias", + "attention.dense.bias", + "post_attention_layernorm.weight", + "post_attention_layernorm.bias", + "mlp.dense_4h_to_h.bias", + "attention.rotary_emb.inv_freq", + "final_layernorm.weight", + "final_layernorm.bias", +] + +GLU_LAYERS = [ + "mlp.dense_h_to_4h.weight", + "mlp.dense_h_to_4h.bias", +] + +QUANTIZED_LAYERS = [ + "attention.dense.weight", + "attention.query_key_value.weight", + "mlp.dense_h_to_4h.weight", + "mlp.dense_4h_to_h.weight", +] + +LAYER_CONCAT_DIM = {"attention.dense.weight": 1, "mlp.dense_4h_to_h.weight": 1} + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument("--input-folder", default=None, type=str, help="Input SAT checkpoint folder") + parser.add_argument("--output-folder", default=None, type=str, help="Output SAT checkpoint folder") + parser.add_argument("--target-tp", default=4, type=int, help="Target TP degree") + parser.add_argument("--quantization-bit-width", default=None, type=int, help="Quantization bit width") + + args = parser.parse_args() + if args.quantization_bit_width is not None: + assert args.quantization_bit_width in [4, 8] + + return args + + +def merge_weights( + key: str, + sd_list: List[Dict], + tp_index: int, + original_tp: int, + target_tp: int, + cat_dim: int, + is_glu: bool, + quantization_bit_width: Optional[int], +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + if original_tp >= target_tp: + if is_glu: + if original_tp > target_tp: + num_part = original_tp // target_tp + assert len(sd_list) == num_part + part1, part2 = [], [] + for i in range(len(sd_list)): + chunks = torch.chunk(sd_list[i][key], 2, dim=cat_dim) + part1.append(chunks[0]) + part2.append(chunks[1]) + merged_sd = torch.cat(part1 + part2, dim=cat_dim) + else: + merged_sd = sd_list[0][key] + else: + merged_sd = torch.cat([sd[key] for sd in sd_list], dim=cat_dim) + else: + assert len(sd_list) == 1 + num_part = target_tp // original_tp + if is_glu: + offset = tp_index % num_part + chunks = torch.chunk(sd_list[0][key], num_part * 2, dim=cat_dim) + merged_sd = torch.cat([chunks[offset], chunks[num_part + offset]], dim=cat_dim) + else: + # without clone, torch will save entire tensor + merged_sd = torch.chunk(sd_list[0][key], num_part, dim=cat_dim)[tp_index % num_part].clone() + + if quantization_bit_width is not None: + from kernels import compress_int4_weight + + weight = merged_sd.cuda() + weight_scale = (weight.abs().max(dim=-1).values / ((2 ** (quantization_bit_width - 1)) - 1)).half() + weight = torch.round(weight / weight_scale[:, None]).to(torch.int8) + if quantization_bit_width == 4: + weight = compress_int4_weight(weight) + return weight.cpu(), weight_scale.cpu() + + return merged_sd + + +def create_checkpoint( + sd_list: List[Dict], tp_index: int, original_tp: int, target_tp: int, quantization_bit_width: Optional[int] +) -> Dict: + new_sd = {} + for key in sd_list[0].keys(): + name = ".".join(key.split(".")[3 if key.startswith("transformer.layers") else 1 :]) + if name in SEQUENTIAL_LAYERS: + new_sd[key] = sd_list[0][key] + else: + new_sd[key] = merge_weights( + key, + sd_list, + tp_index=tp_index, + original_tp=original_tp, + target_tp=target_tp, + cat_dim=LAYER_CONCAT_DIM.get(name, 0), + is_glu=name in GLU_LAYERS, + quantization_bit_width=quantization_bit_width if name in QUANTIZED_LAYERS else None, + ) + if quantization_bit_width is not None and name in 
QUANTIZED_LAYERS: + new_sd[key], new_sd[f"{key}_scale"] = new_sd[key] + new_sd = {"module": new_sd} + return new_sd + + +def main(args): + iteration = open(os.path.join(args.input_folder, "latest"), "r").read() + original_tp = len(glob.glob(os.path.join(args.input_folder, iteration, "mp_rank_*_model_states.pt"))) + print(f"Iteration {iteration} from {args.input_folder} to {args.output_folder}") + os.makedirs(args.output_folder, exist_ok=True) + with open(os.path.join(args.output_folder, "latest"), "w") as file: + file.write(str(iteration)) + os.makedirs(os.path.join(args.output_folder, iteration), exist_ok=True) + + for i in range(0, args.target_tp): + save_path = os.path.join(args.output_folder, iteration, f"mp_rank_{i:02}_model_states.pt") + print(f"Processing {save_path}") + num_parts = original_tp // args.target_tp + sd_list = [ + torch.load( + os.path.join(args.input_folder, iteration, f"mp_rank_{j:02}_model_states.pt"), map_location="cpu" + )["module"] + for j in ( + range(i * num_parts, (i + 1) * num_parts) + if args.target_tp <= original_tp + else [i // (args.target_tp // original_tp)] + ) + ] + torch.save(create_checkpoint(sd_list, i, original_tp, args.target_tp, args.quantization_bit_width), save_path) + + +if __name__ == "__main__": + args = parse_arguments() + main(args) diff --git a/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tools/tokenize_pile.py b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tools/tokenize_pile.py new file mode 100644 index 0000000000000000000000000000000000000000..208c7a107f9fa2625d8e59a1f7721a8aefd96b29 --- /dev/null +++ b/PyTorch/built-in/nlp/GLM-130B_for_PyTorch/tools/tokenize_pile.py @@ -0,0 +1,24 @@ +import json +import tqdm +from icetk import icetk +from multiprocessing import Pool + +DATA_PATH = "/mnt/yrfs/aohan/data/english_data/pile/val.jsonl" +OUTPUT_PATH = "/mnt/yrfs/aohan/data/english_data/pile/val_tokenized.jsonl" + + +def get_data(line): + item = json.loads(line) + item["text_pretokenized"] = item["text"] + item["text"] = icetk.encode(item["text_pretokenized"]) + return json.dumps(item) + "\n" + + +with open(DATA_PATH, "r") as file: + data = file.readlines() + +with Pool(16) as p: + result = list(tqdm.tqdm(p.imap(get_data, data), total=len(data))) + +with open(OUTPUT_PATH, "w") as file: + file.writelines(result)
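Finally, a small sketch of the record transformation tokenize_pile.py performs, since PileDataset in pile.py expects the tokenized layout (icetk token ids under "text", the raw string under "text_pretokenized"). The record below is made up, icetk must be installed, and the hard-coded DATA_PATH / OUTPUT_PATH in the script would need to be adjusted for any other environment:

import json
from icetk import icetk

# One made-up record in the raw Pile jsonl format.
line = json.dumps({"text": "Hello world", "meta": {"pile_set_name": "Wikipedia (en)"}})

item = json.loads(line)
item["text_pretokenized"] = item["text"]                # keep the raw string for UTF-8 length accounting
item["text"] = icetk.encode(item["text_pretokenized"])  # replace the text with icetk token ids
print(json.dumps(item))                                 # one line of the *_tokenized.jsonl output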